netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 3/3] bonding: Improve IGMP join processing
@ 2007-03-01  1:03 Jay Vosburgh
  2007-03-01 16:49 ` Andy Gospodarek
  0 siblings, 1 reply; 15+ messages in thread
From: Jay Vosburgh @ 2007-03-01  1:03 UTC (permalink / raw)
  To: netdev, bonding-devel; +Cc: Jeff Garzik


	In active-backup mode, the current bonding code duplicates IGMP
traffic to all slaves, so that switches are up to date in case of a
failover from an active to a backup interface.  If bonding then fails
back to the original active interface, it is likely that the "active
slave" switch's IGMP forwarding for the port will be out of date until
some event occurs to refresh the switch (e.g., a membership query).

	This patch alters the behavior of bonding to no longer flood
IGMP to all ports, and to issue IGMP JOINs to the newly active port at
the time of a failover.  This insures that switches are kept up to date
for all cases.

	"GOELLESCH Niels" <niels.goellesch@eurocontrol.int> originally
reported this problem, and included a patch.  His original patch was
modified by Jay Vosburgh to additionally remove the existing IGMP flood
behavior, use RCU, streamline code paths, fix trailing white space, and
adjust for style.

Signed-off-by: Jay Vosburgh <fubar@us.ibm.com>


diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 7ec6121..338d452 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -60,6 +60,7 @@ #include <asm/uaccess.h>
 #include <linux/errno.h>
 #include <linux/netdevice.h>
 #include <linux/inetdevice.h>
+#include <linux/igmp.h>
 #include <linux/etherdevice.h>
 #include <linux/skbuff.h>
 #include <net/sock.h>
@@ -861,6 +862,28 @@ static void bond_mc_delete(struct bondin
 	}
 }
 
+
+/*
+ * Retrieve the list of registered multicast addresses for the bonding
+ * device and retransmit an IGMP JOIN request to the current active
+ * slave.
+ */
+static void bond_resend_igmp_join_requests(struct bonding *bond)
+{
+	struct in_device *in_dev;
+	struct ip_mc_list *im;
+
+	rcu_read_lock();
+	in_dev = __in_dev_get_rcu(bond->dev);
+	if (in_dev) {
+		for (im = in_dev->mc_list; im; im = im->next) {
+			ip_mc_rejoin_group(im);
+		}
+	}
+
+	rcu_read_unlock();
+}
+
 /*
  * Totally destroys the mc_list in bond
  */
@@ -874,6 +897,7 @@ static void bond_mc_list_destroy(struct 
 		kfree(dmi);
 		dmi = bond->mc_list;
 	}
+        bond->mc_list = NULL;
 }
 
 /*
@@ -967,6 +991,7 @@ static void bond_mc_swap(struct bonding 
 		for (dmi = bond->dev->mc_list; dmi; dmi = dmi->next) {
 			dev_mc_add(new_active->dev, dmi->dmi_addr, dmi->dmi_addrlen, 0);
 		}
+		bond_resend_igmp_join_requests(bond);
 	}
 }
 
@@ -4017,42 +4042,6 @@ out:
 	return 0;
 }
 
-static void bond_activebackup_xmit_copy(struct sk_buff *skb,
-                                        struct bonding *bond,
-                                        struct slave *slave)
-{
-	struct sk_buff *skb2 = skb_copy(skb, GFP_ATOMIC);
-	struct ethhdr *eth_data;
-	u8 *hwaddr;
-	int res;
-
-	if (!skb2) {
-		printk(KERN_ERR DRV_NAME ": Error: "
-		       "bond_activebackup_xmit_copy(): skb_copy() failed\n");
-		return;
-	}
-
-	skb2->mac.raw = (unsigned char *)skb2->data;
-	eth_data = eth_hdr(skb2);
-
-	/* Pick an appropriate source MAC address
-	 *	-- use slave's perm MAC addr, unless used by bond
-	 *	-- otherwise, borrow active slave's perm MAC addr
-	 *	   since that will not be used
-	 */
-	hwaddr = slave->perm_hwaddr;
-	if (!memcmp(eth_data->h_source, hwaddr, ETH_ALEN))
-		hwaddr = bond->curr_active_slave->perm_hwaddr;
-
-	/* Set source MAC address appropriately */
-	memcpy(eth_data->h_source, hwaddr, ETH_ALEN);
-
-	res = bond_dev_queue_xmit(bond, skb2, slave->dev);
-	if (res)
-		dev_kfree_skb(skb2);
-
-	return;
-}
 
 /*
  * in active-backup mode, we know that bond->curr_active_slave is always valid if
@@ -4073,21 +4062,6 @@ static int bond_xmit_activebackup(struct
 	if (!bond->curr_active_slave)
 		goto out;
 
-	/* Xmit IGMP frames on all slaves to ensure rapid fail-over
-	   for multicast traffic on snooping switches */
-	if (skb->protocol == __constant_htons(ETH_P_IP) &&
-	    skb->nh.iph->protocol == IPPROTO_IGMP) {
-		struct slave *slave, *active_slave;
-		int i;
-
-		active_slave = bond->curr_active_slave;
-		bond_for_each_slave_from_to(bond, slave, i, active_slave->next,
-		                            active_slave->prev)
-			if (IS_UP(slave->dev) &&
-			    (slave->link == BOND_LINK_UP))
-				bond_activebackup_xmit_copy(skb, bond, slave);
-	}
-
 	res = bond_dev_queue_xmit(bond, skb, bond->curr_active_slave->dev);
 
 out:
diff --git a/include/linux/igmp.h b/include/linux/igmp.h
index 9dbb525..a113fe6 100644
--- a/include/linux/igmp.h
+++ b/include/linux/igmp.h
@@ -218,5 +218,7 @@ extern void ip_mc_up(struct in_device *)
 extern void ip_mc_down(struct in_device *);
 extern void ip_mc_dec_group(struct in_device *in_dev, __be32 addr);
 extern void ip_mc_inc_group(struct in_device *in_dev, __be32 addr);
+extern void ip_mc_rejoin_group(struct ip_mc_list *im);
+
 #endif
 #endif
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 0637213..1c6a084 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -1251,6 +1251,28 @@ out:
 }
 
 /*
+ *	Resend IGMP JOIN report; used for bonding.
+ */
+void ip_mc_rejoin_group(struct ip_mc_list *im)
+{
+	struct in_device *in_dev = im->interface;
+
+#ifdef CONFIG_IP_MULTICAST
+	if (im->multiaddr == IGMP_ALL_HOSTS)
+		return;
+
+	if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev)) {
+		igmp_mod_timer(im, IGMP_Initial_Report_Delay);
+		return;
+	}
+	/* else, v3 */
+	im->crcount = in_dev->mr_qrv ? in_dev->mr_qrv :
+		IGMP_Unsolicited_Report_Count;
+	igmp_ifc_event(in_dev);
+#endif
+}
+
+/*
  *	A socket has left a multicast group on device dev
  */
 
@@ -2596,3 +2618,4 @@ #endif
 EXPORT_SYMBOL(ip_mc_dec_group);
 EXPORT_SYMBOL(ip_mc_inc_group);
 EXPORT_SYMBOL(ip_mc_join_group);
+EXPORT_SYMBOL(ip_mc_rejoin_group);

^ permalink raw reply related	[flat|nested] 15+ messages in thread

* Re: [PATCH 3/3] bonding: Improve IGMP join processing
  2007-03-01  1:03 [PATCH 3/3] bonding: Improve IGMP join processing Jay Vosburgh
@ 2007-03-01 16:49 ` Andy Gospodarek
  2007-03-01 17:05   ` Jay Vosburgh
  0 siblings, 1 reply; 15+ messages in thread
From: Andy Gospodarek @ 2007-03-01 16:49 UTC (permalink / raw)
  To: Jay Vosburgh; +Cc: netdev, bonding-devel, Jeff Garzik

On Wed, Feb 28, 2007 at 05:03:37PM -0800, Jay Vosburgh wrote:
> 
> 	In active-backup mode, the current bonding code duplicates IGMP
> traffic to all slaves, so that switches are up to date in case of a
> failover from an active to a backup interface.  If bonding then fails
> back to the original active interface, it is likely that the "active
> slave" switch's IGMP forwarding for the port will be out of date until
> some event occurs to refresh the switch (e.g., a membership query).
> 
> 	This patch alters the behavior of bonding to no longer flood
> IGMP to all ports, and to issue IGMP JOINs to the newly active port at
> the time of a failover.  This insures that switches are kept up to date
> for all cases.
> 
> 	"GOELLESCH Niels" <niels.goellesch@eurocontrol.int> originally
> reported this problem, and included a patch.  His original patch was
> modified by Jay Vosburgh to additionally remove the existing IGMP flood
> behavior, use RCU, streamline code paths, fix trailing white space, and
> adjust for style.
> 
> Signed-off-by: Jay Vosburgh <fubar@us.ibm.com>
> 

My only concern is that this code assumes all mcast addresses stored in
dev->mc-list list are for ipv4 igmp mcast addresses and nothing was done
for ipv6.

But this is much better than what we have now, so... 

Acked-by: Andy Gospodarek <andy@greyhouse.net>


> 
> diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
> index 7ec6121..338d452 100644
> --- a/drivers/net/bonding/bond_main.c
> +++ b/drivers/net/bonding/bond_main.c
> @@ -60,6 +60,7 @@ #include <asm/uaccess.h>
>  #include <linux/errno.h>
>  #include <linux/netdevice.h>
>  #include <linux/inetdevice.h>
> +#include <linux/igmp.h>
>  #include <linux/etherdevice.h>
>  #include <linux/skbuff.h>
>  #include <net/sock.h>
> @@ -861,6 +862,28 @@ static void bond_mc_delete(struct bondin
>  	}
>  }
>  
> +
> +/*
> + * Retrieve the list of registered multicast addresses for the bonding
> + * device and retransmit an IGMP JOIN request to the current active
> + * slave.
> + */
> +static void bond_resend_igmp_join_requests(struct bonding *bond)
> +{
> +	struct in_device *in_dev;
> +	struct ip_mc_list *im;
> +
> +	rcu_read_lock();
> +	in_dev = __in_dev_get_rcu(bond->dev);
> +	if (in_dev) {
> +		for (im = in_dev->mc_list; im; im = im->next) {
> +			ip_mc_rejoin_group(im);
> +		}
> +	}
> +
> +	rcu_read_unlock();
> +}
> +
>  /*
>   * Totally destroys the mc_list in bond
>   */
> @@ -874,6 +897,7 @@ static void bond_mc_list_destroy(struct 
>  		kfree(dmi);
>  		dmi = bond->mc_list;
>  	}
> +        bond->mc_list = NULL;
>  }
>  
>  /*
> @@ -967,6 +991,7 @@ static void bond_mc_swap(struct bonding 
>  		for (dmi = bond->dev->mc_list; dmi; dmi = dmi->next) {
>  			dev_mc_add(new_active->dev, dmi->dmi_addr, dmi->dmi_addrlen, 0);
>  		}
> +		bond_resend_igmp_join_requests(bond);
>  	}
>  }
>  
> @@ -4017,42 +4042,6 @@ out:
>  	return 0;
>  }
>  
> -static void bond_activebackup_xmit_copy(struct sk_buff *skb,
> -                                        struct bonding *bond,
> -                                        struct slave *slave)
> -{
> -	struct sk_buff *skb2 = skb_copy(skb, GFP_ATOMIC);
> -	struct ethhdr *eth_data;
> -	u8 *hwaddr;
> -	int res;
> -
> -	if (!skb2) {
> -		printk(KERN_ERR DRV_NAME ": Error: "
> -		       "bond_activebackup_xmit_copy(): skb_copy() failed\n");
> -		return;
> -	}
> -
> -	skb2->mac.raw = (unsigned char *)skb2->data;
> -	eth_data = eth_hdr(skb2);
> -
> -	/* Pick an appropriate source MAC address
> -	 *	-- use slave's perm MAC addr, unless used by bond
> -	 *	-- otherwise, borrow active slave's perm MAC addr
> -	 *	   since that will not be used
> -	 */
> -	hwaddr = slave->perm_hwaddr;
> -	if (!memcmp(eth_data->h_source, hwaddr, ETH_ALEN))
> -		hwaddr = bond->curr_active_slave->perm_hwaddr;
> -
> -	/* Set source MAC address appropriately */
> -	memcpy(eth_data->h_source, hwaddr, ETH_ALEN);
> -
> -	res = bond_dev_queue_xmit(bond, skb2, slave->dev);
> -	if (res)
> -		dev_kfree_skb(skb2);
> -
> -	return;
> -}
>  
>  /*
>   * in active-backup mode, we know that bond->curr_active_slave is always valid if
> @@ -4073,21 +4062,6 @@ static int bond_xmit_activebackup(struct
>  	if (!bond->curr_active_slave)
>  		goto out;
>  
> -	/* Xmit IGMP frames on all slaves to ensure rapid fail-over
> -	   for multicast traffic on snooping switches */
> -	if (skb->protocol == __constant_htons(ETH_P_IP) &&
> -	    skb->nh.iph->protocol == IPPROTO_IGMP) {
> -		struct slave *slave, *active_slave;
> -		int i;
> -
> -		active_slave = bond->curr_active_slave;
> -		bond_for_each_slave_from_to(bond, slave, i, active_slave->next,
> -		                            active_slave->prev)
> -			if (IS_UP(slave->dev) &&
> -			    (slave->link == BOND_LINK_UP))
> -				bond_activebackup_xmit_copy(skb, bond, slave);
> -	}
> -
>  	res = bond_dev_queue_xmit(bond, skb, bond->curr_active_slave->dev);
>  
>  out:
> diff --git a/include/linux/igmp.h b/include/linux/igmp.h
> index 9dbb525..a113fe6 100644
> --- a/include/linux/igmp.h
> +++ b/include/linux/igmp.h
> @@ -218,5 +218,7 @@ extern void ip_mc_up(struct in_device *)
>  extern void ip_mc_down(struct in_device *);
>  extern void ip_mc_dec_group(struct in_device *in_dev, __be32 addr);
>  extern void ip_mc_inc_group(struct in_device *in_dev, __be32 addr);
> +extern void ip_mc_rejoin_group(struct ip_mc_list *im);
> +
>  #endif
>  #endif
> diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
> index 0637213..1c6a084 100644
> --- a/net/ipv4/igmp.c
> +++ b/net/ipv4/igmp.c
> @@ -1251,6 +1251,28 @@ out:
>  }
>  
>  /*
> + *	Resend IGMP JOIN report; used for bonding.
> + */
> +void ip_mc_rejoin_group(struct ip_mc_list *im)
> +{
> +	struct in_device *in_dev = im->interface;
> +
> +#ifdef CONFIG_IP_MULTICAST
> +	if (im->multiaddr == IGMP_ALL_HOSTS)
> +		return;
> +
> +	if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev)) {
> +		igmp_mod_timer(im, IGMP_Initial_Report_Delay);
> +		return;
> +	}
> +	/* else, v3 */
> +	im->crcount = in_dev->mr_qrv ? in_dev->mr_qrv :
> +		IGMP_Unsolicited_Report_Count;
> +	igmp_ifc_event(in_dev);
> +#endif
> +}
> +
> +/*
>   *	A socket has left a multicast group on device dev
>   */
>  
> @@ -2596,3 +2618,4 @@ #endif
>  EXPORT_SYMBOL(ip_mc_dec_group);
>  EXPORT_SYMBOL(ip_mc_inc_group);
>  EXPORT_SYMBOL(ip_mc_join_group);
> +EXPORT_SYMBOL(ip_mc_rejoin_group);
> -
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 3/3] bonding: Improve IGMP join processing
  2007-03-01 16:49 ` Andy Gospodarek
@ 2007-03-01 17:05   ` Jay Vosburgh
  2007-03-01 19:25     ` Brian Haley
  0 siblings, 1 reply; 15+ messages in thread
From: Jay Vosburgh @ 2007-03-01 17:05 UTC (permalink / raw)
  To: Andy Gospodarek; +Cc: netdev, bonding-devel, Jeff Garzik

Andy Gospodarek <andy@greyhouse.net> wrote:

>On Wed, Feb 28, 2007 at 05:03:37PM -0800, Jay Vosburgh wrote:
[...]
>My only concern is that this code assumes all mcast addresses stored in
>dev->mc-list list are for ipv4 igmp mcast addresses and nothing was done
>for ipv6.
>
>But this is much better than what we have now, so... 

	Agreed, but there's no IPv6 support anywhere in bonding at
present (for unicast or multicast), so this isn't really a loss.

	-J

---
	-Jay Vosburgh, IBM Linux Technology Center, fubar@us.ibm.com


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 3/3] bonding: Improve IGMP join processing
  2007-03-01 17:05   ` Jay Vosburgh
@ 2007-03-01 19:25     ` Brian Haley
  2007-03-01 19:43       ` Andy Gospodarek
  0 siblings, 1 reply; 15+ messages in thread
From: Brian Haley @ 2007-03-01 19:25 UTC (permalink / raw)
  To: Jay Vosburgh; +Cc: Andy Gospodarek, netdev, bonding-devel, Jeff Garzik

Jay Vosburgh wrote:
>> My only concern is that this code assumes all mcast addresses stored in
>> dev->mc-list list are for ipv4 igmp mcast addresses and nothing was done
>> for ipv6.
>>
>> But this is much better than what we have now, so... 
> 
> 	Agreed, but there's no IPv6 support anywhere in bonding at
> present (for unicast or multicast), so this isn't really a loss.

So forgive my naive question, but what would it take to make IPv6 work? 
  I know DAD fails on a test setup I have, but I haven't dug-into why 
this is (I can guess), and I'd like to see it working.  I'm willing to 
help, even if just to get it limping along.

-Brian

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 3/3] bonding: Improve IGMP join processing
  2007-03-01 19:25     ` Brian Haley
@ 2007-03-01 19:43       ` Andy Gospodarek
  2007-03-01 21:58         ` Jay Vosburgh
  2007-03-06 19:55         ` Brian Haley
  0 siblings, 2 replies; 15+ messages in thread
From: Andy Gospodarek @ 2007-03-01 19:43 UTC (permalink / raw)
  To: Brian Haley
  Cc: Jay Vosburgh, Andy Gospodarek, netdev, bonding-devel, Jeff Garzik

On Thu, Mar 01, 2007 at 02:25:19PM -0500, Brian Haley wrote:
> Jay Vosburgh wrote:
> >>My only concern is that this code assumes all mcast addresses stored in
> >>dev->mc-list list are for ipv4 igmp mcast addresses and nothing was done
> >>for ipv6.
> >>
> >>But this is much better than what we have now, so... 
> >
> >	Agreed, but there's no IPv6 support anywhere in bonding at
> >present (for unicast or multicast), so this isn't really a loss.
> 
> So forgive my naive question, but what would it take to make IPv6 work? 
>  I know DAD fails on a test setup I have, but I haven't dug-into why 
> this is (I can guess), and I'd like to see it working.  I'm willing to 
> help, even if just to get it limping along.
> 

Brian,

If we are easily able to differentiate between the multicast addresses
in the mc_list as to which are for ipv4 and which are for ipv6 then it
would be easy to call-out to something in the ipv6 mcast code when
needed instead of always calling out to ipv4 code.

-andy

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 3/3] bonding: Improve IGMP join processing
  2007-03-01 19:43       ` Andy Gospodarek
@ 2007-03-01 21:58         ` Jay Vosburgh
  2007-03-06 19:55         ` Brian Haley
  1 sibling, 0 replies; 15+ messages in thread
From: Jay Vosburgh @ 2007-03-01 21:58 UTC (permalink / raw)
  To: Andy Gospodarek; +Cc: Brian Haley, netdev, bonding-devel, Jeff Garzik

Andy Gospodarek <andy@greyhouse.net> wrote:

>On Thu, Mar 01, 2007 at 02:25:19PM -0500, Brian Haley wrote:
[...]
>> So forgive my naive question, but what would it take to make IPv6 work? 
>>  I know DAD fails on a test setup I have, but I haven't dug-into why 
>> this is (I can guess), and I'd like to see it working.  I'm willing to 
>> help, even if just to get it limping along.
>> 
>
>Brian,
>
>If we are easily able to differentiate between the multicast addresses
>in the mc_list as to which are for ipv4 and which are for ipv6 then it
>would be easy to call-out to something in the ipv6 mcast code when
>needed instead of always calling out to ipv4 code.

	Which covers multicast (at least for the failover case; I think
the regular support is already pretty address-independent).

	Additionally, the IP address tracking for the ARP monitor
(bond_glean_dev_ip) needs IPv6 address support.  I seem to recall that
there's an issue with the slaves each getting separate link local
addresses automatically assigned, but I haven't fooled with that in a
while.  There are likely other problems that would crop up during
serious testing.

	A characterization of what the IPv6 related problems are would
be a good place to start; I would expect that active-backup mode without
arp_monitor shouldn't be too difficult to make operable.

	-J

---
	-Jay Vosburgh, IBM Linux Technology Center, fubar@us.ibm.com

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 3/3] bonding: Improve IGMP join processing
  2007-03-01 19:43       ` Andy Gospodarek
  2007-03-01 21:58         ` Jay Vosburgh
@ 2007-03-06 19:55         ` Brian Haley
  2007-03-06 20:39           ` Jay Vosburgh
  1 sibling, 1 reply; 15+ messages in thread
From: Brian Haley @ 2007-03-06 19:55 UTC (permalink / raw)
  To: Andy Gospodarek; +Cc: Jay Vosburgh, netdev, bonding-devel

Andy Gospodarek wrote:
> If we are easily able to differentiate between the multicast addresses
> in the mc_list as to which are for ipv4 and which are for ipv6 then it
> would be easy to call-out to something in the ipv6 mcast code when
> needed instead of always calling out to ipv4 code.

I've been unable to figure out exactly what you're referring to in the 
code (bond_main.c), it seems to failover all multicast addresses, 
regardless of what address family they are.  I might have missed 
something in 4K lines of code though?

-Brian

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 3/3] bonding: Improve IGMP join processing
  2007-03-06 19:55         ` Brian Haley
@ 2007-03-06 20:39           ` Jay Vosburgh
  2007-03-06 20:44             ` Andy Gospodarek
  0 siblings, 1 reply; 15+ messages in thread
From: Jay Vosburgh @ 2007-03-06 20:39 UTC (permalink / raw)
  To: Brian Haley; +Cc: Andy Gospodarek, netdev, bonding-devel

Brian Haley <brian.haley@hp.com> wrote:

>Andy Gospodarek wrote:
>> If we are easily able to differentiate between the multicast addresses
>> in the mc_list as to which are for ipv4 and which are for ipv6 then it
>> would be easy to call-out to something in the ipv6 mcast code when
>> needed instead of always calling out to ipv4 code.
>
>I've been unable to figure out exactly what you're referring to in the
>code (bond_main.c), it seems to failover all multicast addresses,
>regardless of what address family they are.  I might have missed something
>in 4K lines of code though?

	I believe Andy is talking about bond_resend_igmp_join_requests
being only effective for IGMP v4 and not IGMP v6.  The reason being that
there is (a) no discrimination between v4 and v6 multicast addresses,
and (b) for the v6 case, there's no "rejoin" type function as was
created for IPv4 with the patch.

	-J

---
	-Jay Vosburgh, IBM Linux Technology Center, fubar@us.ibm.com

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 3/3] bonding: Improve IGMP join processing
  2007-03-06 20:39           ` Jay Vosburgh
@ 2007-03-06 20:44             ` Andy Gospodarek
  2007-03-06 22:32               ` David Stevens
  0 siblings, 1 reply; 15+ messages in thread
From: Andy Gospodarek @ 2007-03-06 20:44 UTC (permalink / raw)
  To: Jay Vosburgh; +Cc: Brian Haley, netdev, bonding-devel

On 3/6/07, Jay Vosburgh <fubar@us.ibm.com> wrote:
> Brian Haley <brian.haley@hp.com> wrote:
>
> >Andy Gospodarek wrote:
> >> If we are easily able to differentiate between the multicast addresses
> >> in the mc_list as to which are for ipv4 and which are for ipv6 then it
> >> would be easy to call-out to something in the ipv6 mcast code when
> >> needed instead of always calling out to ipv4 code.
> >
> >I've been unable to figure out exactly what you're referring to in the
> >code (bond_main.c), it seems to failover all multicast addresses,
> >regardless of what address family they are.  I might have missed something
> >in 4K lines of code though?
>
>         I believe Andy is talking about bond_resend_igmp_join_requests
> being only effective for IGMP v4 and not IGMP v6.  The reason being that
> there is (a) no discrimination between v4 and v6 multicast addresses,
> and (b) for the v6 case, there's no "rejoin" type function as was
> created for IPv4 with the patch.
>

/me nods

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 3/3] bonding: Improve IGMP join processing
  2007-03-06 20:44             ` Andy Gospodarek
@ 2007-03-06 22:32               ` David Stevens
  2007-03-06 23:15                 ` [Bonding-devel] " Jay Vosburgh
  0 siblings, 1 reply; 15+ messages in thread
From: David Stevens @ 2007-03-06 22:32 UTC (permalink / raw)
  To: Andy Gospodarek; +Cc: bonding-devel, Brian Haley, fubar, netdev, netdev-owner

It looks to me like "rejoin" is essentially ip_mc_up(), and it'd be better
to call that than add a nearly identical function.

Also, real interfaces already do gratuitous IGMP advertisements when
they are bounced (the reason there is an ip_mc_up()). Could bonding,
when failing over, simply mark the master interface as down, switch, and
then mark the master as up again? In addition to doing the right
thing for both IPv4 and IPv6 multicasting w/o any code changes in those
layers, it may have similar benefits for ARP and neighbor discovery, 
right?
Maybe not-- haven't looked at it...

One down side for IPv6 (which apparently bonding doesn't support) is that
static addresses are lost when the device goes down, but that's a 
difference
form IPv4 that should be fixed.
                                                +-DLS


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [Bonding-devel] [PATCH 3/3] bonding: Improve IGMP join processing
  2007-03-06 22:32               ` David Stevens
@ 2007-03-06 23:15                 ` Jay Vosburgh
  2007-03-07  0:45                   ` David Stevens
  2007-03-07  1:50                   ` Andy Gospodarek
  0 siblings, 2 replies; 15+ messages in thread
From: Jay Vosburgh @ 2007-03-06 23:15 UTC (permalink / raw)
  To: David Stevens
  Cc: Andy Gospodarek, netdev, Brian Haley, bonding-devel, netdev-owner


David Stevens <dlstevens@us.ibm.com> wrote:

>It looks to me like "rejoin" is essentially ip_mc_up(), and it'd be better
>to call that than add a nearly identical function.

	Won't ip_mc_up() acquire an additional reference (via
ip_mc_inc_group) to the IGMP_ALL_HOSTS im->users that would never be
released (in the case of bonding calling the function out of the blue)?

	In looking at it, the ip_mc_rejoin_group function (the new one
added with the patch) is a lot more like igmp_group_added() than
ip_mc_up().  I'm not sure if the extra bits in igmp_group_added() are
worthy of concern; I'm thinking not, since im->loaded shouldn't be zero
coming in for the bonding case.

	I think the meat that the "rejoin" wants is what's in
igmpv3_send_cr(), which appears to do the actual sending stuff.  I'm not
sure if that's better to call directly (and risk locking adventures) or
to just trip the timer via igmp_ifc_event().

	Anyway, it looks like all of this needs to be done under RTNL,
which isn't the case, so I need to go off and look into reworking it
again.

	Andy: do you have any work in progress on the sleep / rtnl stuff
we've been discussing?

>Also, real interfaces already do gratuitous IGMP advertisements when
>they are bounced (the reason there is an ip_mc_up()). Could bonding,
>when failing over, simply mark the master interface as down, switch, and
>then mark the master as up again? In addition to doing the right
>thing for both IPv4 and IPv6 multicasting w/o any code changes in those
>layers, it may have similar benefits for ARP and neighbor discovery, 
>right?

	Marking the master down would, I believe, issue notifiers that
the device has gone down.  Various things, network manager sort of
applications in particular, listen to those, so I'm not sure it's a good
idea.  I think there are other side effects as well, I'm thinking it
would flush routes associated with the interface as well.

	-J

---
	-Jay Vosburgh, IBM Linux Technology Center, fubar@us.ibm.com

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [Bonding-devel] [PATCH 3/3] bonding: Improve IGMP join processing
  2007-03-06 23:15                 ` [Bonding-devel] " Jay Vosburgh
@ 2007-03-07  0:45                   ` David Stevens
  2007-03-07  1:50                   ` Andy Gospodarek
  1 sibling, 0 replies; 15+ messages in thread
From: David Stevens @ 2007-03-07  0:45 UTC (permalink / raw)
  To: fubar; +Cc: Andy Gospodarek, bonding-devel, Brian Haley, netdev, netdev-owner

fubar@linux.vnet.ibm.com wrote on 03/06/2007 03:15:41 PM:

> 
> David Stevens <dlstevens@us.ibm.com> wrote:
> 
> >It looks to me like "rejoin" is essentially ip_mc_up(), and it'd be 
better
> >to call that than add a nearly identical function.
> 
>    Won't ip_mc_up() acquire an additional reference (via
> ip_mc_inc_group) to the IGMP_ALL_HOSTS im->users that would never be
> released (in the case of bonding calling the function out of the blue)?

        Yes. I'm not sure that matters-- destroy_dev doesn't care how many
references to a group, and IGMP_ALL_HOSTS isn't advertised (so wouldn't
get a "leave" when you only down the interface, like other groups do).
But since ip_mc_up() is *entirely* that join plus group_added() on all
the existing groups, there really shouldn't be another. But the new device
will need the all-hosts group in its hardware multicast filter, too, if
it hasn't already been using multicasting. Your "reload" caller could just
dec_group that group after calling ip_mc_up().

>    In looking at it, the ip_mc_rejoin_group function (the new one
> added with the patch) is a lot more like igmp_group_added() than
> ip_mc_up().
No, group_added() is one group. mc_up() just calls group_added on all
of them, which I think is what the rejoin was trying to do.

>I'm not sure if the extra bits in igmp_group_added() are
> worthy of concern; I'm thinking not, since im->loaded shouldn't be zero
> coming in for the bonding case.
        "im->loaded" means the device has it in its multicast address 
filter.
If you're switching devices, and didn't do all the multicast stuff on all
the devices originally, then you want it to be 0 (and should make it so,
like ip_mc_down() does). :-)

>    I think the meat that the "rejoin" wants is what's in
> igmpv3_send_cr(), which appears to do the actual sending stuff.  I'm not
> sure if that's better to call directly (and risk locking adventures) or
> to just trip the timer via igmp_ifc_event().

        No, no, no -- please don't mess with those directly. It'd be a 
maintenance
nightmare, and multicasting is device independent right now. :-) I'd hope 
there
wouldn't be any bonding-specific code needed at this layer, which is why I 
hope
something like using up/down would work out.

                                                                +-DLS


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [Bonding-devel] [PATCH 3/3] bonding: Improve IGMP join processing
  2007-03-06 23:15                 ` [Bonding-devel] " Jay Vosburgh
  2007-03-07  0:45                   ` David Stevens
@ 2007-03-07  1:50                   ` Andy Gospodarek
  2007-03-07  3:21                     ` David Stevens
  1 sibling, 1 reply; 15+ messages in thread
From: Andy Gospodarek @ 2007-03-07  1:50 UTC (permalink / raw)
  To: Jay Vosburgh
  Cc: David Stevens, Andy Gospodarek, netdev, Brian Haley,
	bonding-devel, netdev-owner

On Tue, Mar 06, 2007 at 03:15:41PM -0800, Jay Vosburgh wrote:
> 
> David Stevens <dlstevens@us.ibm.com> wrote:
> 
> >It looks to me like "rejoin" is essentially ip_mc_up(), and it'd be better
> >to call that than add a nearly identical function.
> 
> 	Won't ip_mc_up() acquire an additional reference (via
> ip_mc_inc_group) to the IGMP_ALL_HOSTS im->users that would never be
> released (in the case of bonding calling the function out of the blue)?
> 
> 	In looking at it, the ip_mc_rejoin_group function (the new one
> added with the patch) is a lot more like igmp_group_added() than
> ip_mc_up().  I'm not sure if the extra bits in igmp_group_added() are
> worthy of concern; I'm thinking not, since im->loaded shouldn't be zero
> coming in for the bonding case.
> 
> 	I think the meat that the "rejoin" wants is what's in
> igmpv3_send_cr(), which appears to do the actual sending stuff.  I'm not
> sure if that's better to call directly (and risk locking adventures) or
> to just trip the timer via igmp_ifc_event().
> 
> 	Anyway, it looks like all of this needs to be done under RTNL,
> which isn't the case, so I need to go off and look into reworking it
> again.
> 
> 	Andy: do you have any work in progress on the sleep / rtnl stuff
> we've been discussing?

Jay, 

I do, but unfortunately it's much closer to the code I'd proposed
originally than the code you sent me.  The more I audited your code, the
more I like the design -- until I discovered that every time you pause
the timers you need to flush the workqueue.  This is bad since you are
regularly stopping the timers in places where the rtnl lock is taken
and the currently running work item may need to that lock to complete.
With a small enough monitor interval I could deadlock pretty quickly.
Without the benefit of the full stop, I couldn't justify the major
conversion just yet (plus is feels like keeping a list of the timers is
re-implementing what workqueues are designed to do for you).

I've got a patch that seems decent so far, but its really just at
timer->workqueue conversion with some bits thrown in correctly stop the
queues when taking the interface down or when removing the module.

> >Also, real interfaces already do gratuitous IGMP advertisements when
> >they are bounced (the reason there is an ip_mc_up()). Could bonding,
> >when failing over, simply mark the master interface as down, switch, and
> >then mark the master as up again? In addition to doing the right
> >thing for both IPv4 and IPv6 multicasting w/o any code changes in those
> >layers, it may have similar benefits for ARP and neighbor discovery, 
> >right?
> 
> 	Marking the master down would, I believe, issue notifiers that
> the device has gone down.  Various things, network manager sort of
> applications in particular, listen to those, so I'm not sure it's a good
> idea.  I think there are other side effects as well, I'm thinking it
> would flush routes associated with the interface as well.

I agree with Jay here.  I hate that bonding has to have so much
knowledge about upper layer protocols, but for the ones that are
stateful like IGMP we will need fixes like the one proposed.


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [Bonding-devel] [PATCH 3/3] bonding: Improve IGMP join processing
  2007-03-07  1:50                   ` Andy Gospodarek
@ 2007-03-07  3:21                     ` David Stevens
  2007-03-09 16:53                       ` Andy Gospodarek
  0 siblings, 1 reply; 15+ messages in thread
From: David Stevens @ 2007-03-07  3:21 UTC (permalink / raw)
  To: Andy Gospodarek
  Cc: Andy Gospodarek, bonding-devel, Brian Haley, fubar, netdev,
	netdev-owner

 > >    Marking the master down would, I believe, issue notifiers that
> > the device has gone down.  Various things, network manager sort of
> > applications in particular, listen to those, so I'm not sure it's a 
good
> > idea.  I think there are other side effects as well, I'm thinking it
> > would flush routes associated with the interface as well.

[BTW, you can call ip_mc_down()/ip_mc_up() directly w/o getting there
        from the notifiers -- then no side-effects.]

Andy Gospodarek wrote:
> 
> I agree with Jay here.  I hate that bonding has to have so much
> knowledge about upper layer protocols, but for the ones that are
> stateful like IGMP we will need fixes like the one proposed.

        I have no problem with bonding having knowledge of ULP's (I
don't like it, but I don't have to look at it :-) ), but the
patch is doing it the other way around. What I don't like about the
proposed patch is that it's adding knowledge of bonding to IGMP.
        And IGMP does work fine in this case, w/o flooding or the
proposed patch. It just has the risk of losing multicast packets
during one query interval, and that only happens if you're
using a switch that does IGMP snooping.
        I'd like the patch a lot better if it were basicly this:

mc_bond_fudge(void)
{
        ip_mc_down(masterdev);
/*do whatever you need to do to switch the slave */
        ip_mc_up(masterdev);
}

        That doesn't go through the notifier chain, uses existing
functions, doesn't have any refcnt issues, and most importantly
could/should reside in a bonding source file and not in igmp.c. :-)
        But RTNL is required whether you use up/down or roll your
own variant, so it sounds like you have other issues to resolve too.

                                                        +-DLS



^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [Bonding-devel] [PATCH 3/3] bonding: Improve IGMP join processing
  2007-03-07  3:21                     ` David Stevens
@ 2007-03-09 16:53                       ` Andy Gospodarek
  0 siblings, 0 replies; 15+ messages in thread
From: Andy Gospodarek @ 2007-03-09 16:53 UTC (permalink / raw)
  To: David Stevens
  Cc: Andy Gospodarek, bonding-devel, Brian Haley, fubar, netdev,
	netdev-owner

On Tue, Mar 06, 2007 at 07:21:30PM -0800, David Stevens wrote:
>  > >    Marking the master down would, I believe, issue notifiers that
> > > the device has gone down.  Various things, network manager sort of
> > > applications in particular, listen to those, so I'm not sure it's a 
> good
> > > idea.  I think there are other side effects as well, I'm thinking it
> > > would flush routes associated with the interface as well.
> 
> [BTW, you can call ip_mc_down()/ip_mc_up() directly w/o getting there
>         from the notifiers -- then no side-effects.]
> 

While this might seem like a nice solution if we think only about what
would cause the smallest change to igmp.c (since you just need to add a
small patch to export additional symbols), I shudder to think about the
disruption that this could cause in the network in some cases.

> Andy Gospodarek wrote:
> > 
> > I agree with Jay here.  I hate that bonding has to have so much
> > knowledge about upper layer protocols, but for the ones that are
> > stateful like IGMP we will need fixes like the one proposed.
> 
>         I have no problem with bonding having knowledge of ULP's (I
> don't like it, but I don't have to look at it :-) ), but the
> patch is doing it the other way around. What I don't like about the
> proposed patch is that it's adding knowledge of bonding to IGMP.

I disagree with this statement.  Why does adding an extra function to
aide in the transmission of additional igmp joins cause igmp to have
knowledge of bonding?

>         And IGMP does work fine in this case, w/o flooding or the
> proposed patch. It just has the risk of losing multicast packets
> during one query interval, and that only happens if you're
> using a switch that does IGMP snooping.

I feel like igmp snooping is a pretty widely used feature and I would
guess that it is used by anyone doing a/b bonding that has concerns
about igmp failover.

>         I'd like the patch a lot better if it were basicly this:
> 
> mc_bond_fudge(void)
> {
>         ip_mc_down(masterdev);
> /*do whatever you need to do to switch the slave */
>         ip_mc_up(masterdev);
> }
> 
>         That doesn't go through the notifier chain, uses existing
> functions, doesn't have any refcnt issues, and most importantly
> could/should reside in a bonding source file and not in igmp.c. :-)
>         But RTNL is required whether you use up/down or roll your
> own variant, so it sounds like you have other issues to resolve too.

The RTNL stuff would be all done work if Jay would just accept my
workqueue patch.  ;-)

> 
>                                                         +-DLS
> 
> 
> -
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 15+ messages in thread

end of thread, other threads:[~2007-03-09 16:54 UTC | newest]

Thread overview: 15+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2007-03-01  1:03 [PATCH 3/3] bonding: Improve IGMP join processing Jay Vosburgh
2007-03-01 16:49 ` Andy Gospodarek
2007-03-01 17:05   ` Jay Vosburgh
2007-03-01 19:25     ` Brian Haley
2007-03-01 19:43       ` Andy Gospodarek
2007-03-01 21:58         ` Jay Vosburgh
2007-03-06 19:55         ` Brian Haley
2007-03-06 20:39           ` Jay Vosburgh
2007-03-06 20:44             ` Andy Gospodarek
2007-03-06 22:32               ` David Stevens
2007-03-06 23:15                 ` [Bonding-devel] " Jay Vosburgh
2007-03-07  0:45                   ` David Stevens
2007-03-07  1:50                   ` Andy Gospodarek
2007-03-07  3:21                     ` David Stevens
2007-03-09 16:53                       ` Andy Gospodarek

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).