igmp: Allow mininum interval specification for igmp timers.

netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

* igmp: Allow mininum interval specification for igmp timers.
@ 2010-09-22 18:59 Christoph Lameter
       [not found] ` <alpine.DEB.2.00.1009221354410.32661-sBS69tsa9Uj/9pzu0YdTqQ@public.gmane.org>
  0 siblings, 1 reply; 26+ messages in thread
From: Christoph Lameter @ 2010-09-22 18:59 UTC (permalink / raw)
  To: linux-rdma-u79uwXL29TY76Z2rM5mHXA, netdev-u79uwXL29TY76Z2rM5mHXA
  Cc: Bob Arendt, David S. Miller, David L Stevens

IGMP timers sometimes fire too rapidly due to randomization of the
intervals from 0 to max_delay in igmp_start_timer(). For some situations
(like the initial IGMP reports that are not responses to an IGMP query) we
do not want them in too rapid succession otherwise all the initial reports
may be lost due to race conditions with the reconfiguration of the
routers and switches going on via the link layer (like on Infiniband). If
those are lost then the router will only discover that a new mc group was
joined when the igmp query was sent. General IGMP queries may be sent
rarely on large fabrics resulting in excessively long wait times until
data starts flowing. The application may abort before then concluding that
the network hardware is not operational.

The worst case scenario without the changes will send 3 igmp reports on join:

First		3 jiffies ("immediate" (spec) ~3 ms)
Second		3 jiffies (randomization leads to shortest interval) 3 ms
Third		3 jiffies (randomization leads to shortest interval) 3 ms

Which may result in a total of less than 10ms until the kernel gives up sending
igmp requests.

Change the IGMP layer to allow the specification of minimum and maximum delay.
Calculate the IGMP_Unsolicited_Report interval based on what the interval
before this patch would be on a 100HZ kernel. 3 jiffies at 100 HZ would result
in a minimum ~30 milliseconds spacing between the initial two IGMP reports.
Round it up to 40ms.

This will result in 3 initial unsolicited reports

First	"immediately"	3 jiffies (~ 3ms)
Second	randomized 40ms to 10seconds later
Third	randomized 40ms	to 10seconds later

So a minimum of ~83ms will pass before the unsolicited reports are
given up.

Signed-off-by: Christoph Lameter <cl-vYTEC60ixJUAvxtiuMwx3w@public.gmane.org>

---
 net/ipv4/igmp.c |   45 +++++++++++++++++++++++++++++++--------------
 1 file changed, 31 insertions(+), 14 deletions(-)

Index: linux-2.6/net/ipv4/igmp.c
===================================================================
--- linux-2.6.orig/net/ipv4/igmp.c	2010-09-22 11:15:19.000000000 -0500
+++ linux-2.6/net/ipv4/igmp.c	2010-09-22 12:50:32.000000000 -0500
@@ -116,10 +116,17 @@
 #define IGMP_V2_Router_Present_Timeout		(400*HZ)
 #define IGMP_Unsolicited_Report_Interval	(10*HZ)
 #define IGMP_Query_Response_Interval		(10*HZ)
-#define IGMP_Unsolicited_Report_Count		2

+/* Parameters not specified in igmp rfc. */
+
+/* Mininum ticks to have a meaningful notion of delay */
+#define IGMP_Mininum_Delay			(2)
+
+/* Control of unsolilcited reports (after join) */

+#define IGMP_Unsolicited_Report_Count		2
 #define IGMP_Initial_Report_Delay		(1)
+#define IGMP_Unsolicited_Report_Min_Delay	(HZ/25)

 /* IGMP_Initial_Report_Delay is not from IGMP specs!
  * IGMP specs require to report membership immediately after
@@ -174,22 +181,30 @@ static __inline__ void igmp_stop_timer(s
 	spin_unlock_bh(&im->lock);
 }

-/* It must be called with locked im->lock */
-static void igmp_start_timer(struct ip_mc_list *im, int max_delay)
+static inline unsigned long jiffies_rand_delay(int min_delay, int max_delay)
 {
-	int tv = net_random() % max_delay;
+	int d = min_delay;
+
+	if (min_delay < max_delay)
+		d += net_random() % (max_delay - min_delay);

+	return jiffies + d;
+}
+
+/* It must be called with locked im->lock */
+static void igmp_start_timer(struct ip_mc_list *im, int min_delay, int max_delay)
+{
 	im->tm_running = 1;
-	if (!mod_timer(&im->timer, jiffies+tv+2))
+	if (!mod_timer(&im->timer, jiffies_rand_delay(min_delay, max_delay)))
 		atomic_inc(&im->refcnt);
 }

 static void igmp_gq_start_timer(struct in_device *in_dev)
 {
-	int tv = net_random() % in_dev->mr_maxdelay;
-
 	in_dev->mr_gq_running = 1;
-	if (!mod_timer(&in_dev->mr_gq_timer, jiffies+tv+2))
+	if (!mod_timer(&in_dev->mr_gq_timer,
+			jiffies_rand_delay(IGMP_Mininum_Delay,
+					in_dev->mr_maxdelay)))
 		in_dev_hold(in_dev);
 }

@@ -201,7 +216,7 @@ static void igmp_ifc_start_timer(struct
 		in_dev_hold(in_dev);
 }

-static void igmp_mod_timer(struct ip_mc_list *im, int max_delay)
+static void igmp_mod_timer(struct ip_mc_list *im, int min_delay, int max_delay)
 {
 	spin_lock_bh(&im->lock);
 	im->unsolicit_count = 0;
@@ -214,7 +229,7 @@ static void igmp_mod_timer(struct ip_mc_
 		}
 		atomic_dec(&im->refcnt);
 	}
-	igmp_start_timer(im, max_delay);
+	igmp_start_timer(im, min_delay, max_delay);
 	spin_unlock_bh(&im->lock);
 }

@@ -733,7 +748,8 @@ static void igmp_timer_expire(unsigned l

 	if (im->unsolicit_count) {
 		im->unsolicit_count--;
-		igmp_start_timer(im, IGMP_Unsolicited_Report_Interval);
+		igmp_start_timer(im, IGMP_Unsolicited_Report_Min_Delay,
+				IGMP_Unsolicited_Report_Interval);
 	}
 	im->reporter = 1;
 	spin_unlock(&im->lock);
@@ -911,7 +927,7 @@ static void igmp_heard_query(struct in_d
 			igmp_marksources(im, ntohs(ih3->nsrcs), ih3->srcs);
 		spin_unlock_bh(&im->lock);
 		if (changed)
-			igmp_mod_timer(im, max_delay);
+			igmp_mod_timer(im, IGMP_Mininum_Delay, max_delay);
 	}
 	read_unlock(&in_dev->mc_list_lock);
 }
@@ -1169,7 +1185,7 @@ static void igmp_group_added(struct ip_m
 		return;
 	if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev)) {
 		spin_lock_bh(&im->lock);
-		igmp_start_timer(im, IGMP_Initial_Report_Delay);
+		igmp_start_timer(im, IGMP_Mininum_Delay, IGMP_Initial_Report_Delay);
 		spin_unlock_bh(&im->lock);
 		return;
 	}
@@ -1258,7 +1274,8 @@ void ip_mc_rejoin_group(struct ip_mc_lis
 		return;

 	if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev)) {
-		igmp_mod_timer(im, IGMP_Initial_Report_Delay);
+		igmp_mod_timer(im, IGMP_Mininum_Delay,
+					IGMP_Initial_Report_Delay);
 		return;
 	}
 	/* else, v3 */
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 26+ messages in thread

* igmp: Staggered igmp report intervals for unsolicited igmp reports
       [not found] ` <alpine.DEB.2.00.1009221354410.32661-sBS69tsa9Uj/9pzu0YdTqQ@public.gmane.org>
@ 2010-09-22 19:01   ` Christoph Lameter
       [not found]     ` <alpine.DEB.2.00.1009221400010.32661-sBS69tsa9Uj/9pzu0YdTqQ@public.gmane.org>
  2010-09-24  4:38   ` igmp: Allow mininum interval specification for igmp timers David Miller
  1 sibling, 1 reply; 26+ messages in thread
From: Christoph Lameter @ 2010-09-22 19:01 UTC (permalink / raw)
  To: linux-rdma-u79uwXL29TY76Z2rM5mHXA, netdev-u79uwXL29TY76Z2rM5mHXA
  Cc: Bob Arendt, David S. Miller, David L Stevens

The earlier patch added an initial minimum latency and got us up to
~80ms. However, there are large networks that take longer to configure
multicast paths.

This patch changes the behavior for unsolicited igmp reports to ensure
that even sporadic loss of the initial IGMP reports will result in a
reasonably fast subscription.

The rfc states that the first igmp report should be sent immediately and
then mentions that a couple of more should be sent but does not specify
exactly how the repeating of the igmp reports should occur. The RFC
suggests that the behavior in response to an IGMP report (randomized
response 0-max response time) could be followed but we have seen issues
with this suggestion since the intervals can be very short. There is also
no reason to randomize since the unsolicited reports are not a response to
an igmp query but the result of a join request in the code.

The patch here establishes more fixed delays for sending unsolicited
igmp reports after join. There is still a fuzz factor associated but the
sending of the igmp reports follows more tightly a set of intervals and sends
up to 7 igmp reports.

IGMP Report	Time delay
------------------------------------------------------------
0		3 ticks		"immediate" according to RFC.
1		40ms
2		200ms
3		1sec
4		5sec
5		10sec
6		60sec

So unsolicited reports are sent for an interval of at least a minute
(reports are aborted if igmp reports or other info is seen).

Signed-off-by: Christoph Lameter <cl-vYTEC60ixJUAvxtiuMwx3w@public.gmane.org>

---
 net/ipv4/igmp.c |   38 ++++++++++++++++++++++++++++++++++----
 1 file changed, 34 insertions(+), 4 deletions(-)

Index: linux-2.6/net/ipv4/igmp.c
===================================================================
--- linux-2.6.orig/net/ipv4/igmp.c	2010-09-22 12:50:32.000000000 -0500
+++ linux-2.6/net/ipv4/igmp.c	2010-09-22 13:32:58.000000000 -0500
@@ -124,17 +124,40 @@

 /* Control of unsolilcited reports (after join) */

-#define IGMP_Unsolicited_Report_Count		2
+#define IGMP_Unsolicited_Report_Count		6
 #define IGMP_Initial_Report_Delay		(1)
 #define IGMP_Unsolicited_Report_Min_Delay	(HZ/25)
+#define IGMP_Unsolicited_Fuzz			(HZ/100)
+

 /* IGMP_Initial_Report_Delay is not from IGMP specs!
  * IGMP specs require to report membership immediately after
  * joining a group, but we delay the first report by a
  * small interval. It seems more natural and still does not
  * contradict to specs provided this delay is small enough.
+ *
+ * The spec does not say how the initial igmp reports
+ * need to be repeated (aside from suggesting to just do the
+ * randomization of the intervals as for igmp queries but then
+ * there is no centralized trigger and therefore no randomization
+ * needed). We provide an array of delays here that are likely
+ * to work in general avoiding the often too short or too long intervals
+ * that would be generated if we would follow the suggestion in the rfc.
+ *
+ * Note that the sending of unsolicited reports may stop at any point
+ * if we see an igmp query from a router or a neighbor's igmp report.
  */

+static int unsolicited_delay[IGMP_Unsolicited_Report_Count + 1] = {
+	IGMP_Initial_Report_Delay + IGMP_Mininum_Delay,	/* "Immediate" */
+	HZ / 25,			/* 40ms  */
+	HZ / 5,				/* 200ms */
+	HZ,
+	5 * HZ,
+	10 * HZ,
+	60 * HZ
+};
+
 #define IGMP_V1_SEEN(in_dev) \
 	(IPV4_DEVCONF_ALL(dev_net(in_dev->dev), FORCE_IGMP_VERSION) == 1 || \
 	 IN_DEV_CONF_GET((in_dev), FORCE_IGMP_VERSION) == 1 || \
@@ -199,6 +222,13 @@ static void igmp_start_timer(struct ip_m
 		atomic_inc(&im->refcnt);
 }

+static void igmp_start_initial_timer(struct ip_mc_list *im, int interval)
+{
+	int delay = unsolicited_delay[interval];
+
+	igmp_start_timer(im, delay, delay + IGMP_Unsolicited_Fuzz);
+}
+
 static void igmp_gq_start_timer(struct in_device *in_dev)
 {
 	in_dev->mr_gq_running = 1;
@@ -748,8 +778,8 @@ static void igmp_timer_expire(unsigned l

 	if (im->unsolicit_count) {
 		im->unsolicit_count--;
-		igmp_start_timer(im, IGMP_Unsolicited_Report_Min_Delay,
-				IGMP_Unsolicited_Report_Interval);
+		igmp_start_initial_timer(im,
+			IGMP_Unsolicited_Report_Count - im->unsolicit_count);
 	}
 	im->reporter = 1;
 	spin_unlock(&im->lock);
@@ -1185,7 +1215,7 @@ static void igmp_group_added(struct ip_m
 		return;
 	if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev)) {
 		spin_lock_bh(&im->lock);
-		igmp_start_timer(im, IGMP_Mininum_Delay, IGMP_Initial_Report_Delay);
+		igmp_start_initial_timer(im, 0);
 		spin_unlock_bh(&im->lock);
 		return;
 	}

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: igmp: Staggered igmp report intervals for unsolicited igmp reports
       [not found]     ` <alpine.DEB.2.00.1009221400010.32661-sBS69tsa9Uj/9pzu0YdTqQ@public.gmane.org>
@ 2010-09-22 19:30       ` David Stevens
       [not found]         ` <OFF06BBC88.0B6755D5-ON882577A6.0068F4F8-882577A6.006B31FB-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  0 siblings, 1 reply; 26+ messages in thread
From: David Stevens @ 2010-09-22 19:30 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: David S. Miller, linux-rdma-u79uwXL29TY76Z2rM5mHXA,
	netdev-u79uwXL29TY76Z2rM5mHXA, Bob Arendt

Christoph Lameter <cl-vYTEC60ixJUAvxtiuMwx3w@public.gmane.org> wrote on 09/22/2010 12:01:28 PM:

> The earlier patch added an initial mininum latency and got us up to
> ~80ms. However, there are large networks that take longer to configure
> multicast paths.

        I feel your pain, but the protocol allows this to be 0 and all
of the unsolicited reports can be lost. I don't think adding a minimum
latency solves a general problem. Perhaps the device should queue some
packets if it isn't ready quickly? A querier is what makes these
reliable, but for the start-up in particular, I think it'd be better
to not initiate the send on devices that have this problem until the
device is actually ready to send-- why not put the delay in the device
driver on initialization?
 
> with this suggestion since the intervals can be very short. There is 
also
> no reason to randomize since the unsolicited reports are not a response 
to
> an igmp query but the result of a join request in the code.

        These are also staggered to prevent a storm by mass reboots, e.g.,
from a power outage, and the default groups are joined on interface
bring-up.


> The patch here establishes more fixed delays for sending unsolicited
> igmp reports after join. There is still a fuzz factor associated but the
> sending of the igmp reports follows more tightly a set of intervals and 
sends
> up to 7 igmp reports.
> 
> IGMP Report   Time delay
> ------------------------------------------------------------
> 0      3 ticks      "immediate" accordig to RFC.
> 1      40ms
> 2      200ms
> 3      1sec
> 4      5sec
> 5      10sec
> 6      60sec
> 
> So unsolicited reports are send for an interval of at least a minute
> (reports are aborted if igmp reports or other info is seen).

        This is outside the protocol spec, and the intervals are neither
random nor scaled based on any network performance metric.

1) I'm not sure there's a problem here to solve, other than for your
        particular hardware.
2) I think this would better be solved in the driver-- don't do the
        upper initialization and group joins until the sends can actually
        succeed.
3) I don't think it's a good idea to make up intervals, and especially
        non-randomized ones. The probability of getting all minimum 
intervals
        is very low (which goes back to #1) and sending fixed intervals 
may
        introduce a problem (packet storms) that isn't there per RFC. 
These
        fixed intervals can also be either way too long or way too short,
        depending on link characteristics they don't account for. Leaving
        the intervals randomized based on querier-supplied data seems much
        more appropriate to me.


                                                                +-DLS

 
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: igmp: Staggered igmp report intervals for unsolicited igmp reports
       [not found]         ` <OFF06BBC88.0B6755D5-ON882577A6.0068F4F8-882577A6.006B31FB-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2010-09-22 19:58           ` Christoph Lameter
       [not found]             ` <alpine.DEB.2.00.1009221448460.32661-sBS69tsa9Uj/9pzu0YdTqQ@public.gmane.org>
  2010-09-22 20:56             ` Bob Arendt
  0 siblings, 2 replies; 26+ messages in thread
From: Christoph Lameter @ 2010-09-22 19:58 UTC (permalink / raw)
  To: David Stevens
  Cc: David S. Miller, linux-rdma-u79uwXL29TY76Z2rM5mHXA,
	netdev-u79uwXL29TY76Z2rM5mHXA, Bob Arendt

On Wed, 22 Sep 2010, David Stevens wrote:

>         I feel your pain, but the protocol allows this to be 0 and all
> of the unsolicited reports can be lost. I don't think adding a minimum
> latency solves a general problem. Perhaps the device should queue some

The protocol does not specify the intervals during unsolicited igmp
sends. It only specifies the intervals as a result of a igmp query.

> packets if it isn't ready quickly? A querier is what makes these
> reliable, but for the start-up in particular, I think it'd be better
> to not initiate the send on devices that have this problem until the
> device is actually ready to send-- why not put the delay in the device
> driver on initialization?

The device is ready. It's just the multicast group that has not been
established yet.

> > an igmp query but the result of a join request in the code.
>
>         These are also staggered to prevent a storm by mass reboots, e.g.,
> from a power outage, and the default groups are joined on interface
> bring-up.

There is still some staggering left (see IGMP_Unsolicited_Fuzz). I can
increase that if necessary.

There also cannot be any storm since any unsolicited igmp report by any
system will stop the unsolicited igmp reports by any other system.

> > So unsolicited reports are send for an interval of at least a minute
> > (reports are aborted if igmp reports or other info is seen).
>
>         This is outside the protocol spec, and the intervals are neither
> random nor scaled based on any network performance metric.

Where does it say that in the spec? Again this is an *unsolicited* igmp
report.

> 2) I think this would better be solved in the driver-- don't do the
>         upper initialization and group joins until the sends can actually
>         succeed.

The driver is fine. It's just the multicast path in the network that takes
time to establish.

> 3) I don't think it's a good idea to make up intervals, and especially
>         non-randomized ones. The probability of getting all minimum
> intervals
>         is very low (which goes back to #1) and sending fixed intervals
> may
>         introduce a problem (packet storms) that isn't there per RFC.
> These
>         fixed intervals can also be either way too long or way too short,
>         depending on link characteristics they don't account for. Leaving
>         the intervals randomized based on querier-supplied data seems much
>         more appropriate to me.

These are *unsolicited* igmp reports. There is *no* querier supplied data
yet. The first querier supplied data (or any other unsolicited igmp
report) will immediately stop the unsolicited reports and then will
continue to respond in randomized intervals based on the data that the
querier has supplied.


--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: igmp: Staggered igmp report intervals for unsolicited igmp reports
       [not found]             ` <alpine.DEB.2.00.1009221448460.32661-sBS69tsa9Uj/9pzu0YdTqQ@public.gmane.org>
@ 2010-09-22 20:36               ` David Stevens
  2010-09-22 21:26                 ` Christoph Lameter
  2010-09-22 21:50               ` Jason Gunthorpe
  1 sibling, 1 reply; 26+ messages in thread
From: David Stevens @ 2010-09-22 20:36 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: David S. Miller, linux-rdma-u79uwXL29TY76Z2rM5mHXA,
	netdev-u79uwXL29TY76Z2rM5mHXA, Bob Arendt

Christoph Lameter <cl-vYTEC60ixJUAvxtiuMwx3w@public.gmane.org> wrote:
> 
> On Wed, 22 Sep 2010, David Stevens wrote:
> 
> >         I feel your pain, but the protocol allows this to be 0 and all
> > of the unsolicited reports can be lost. I don't think adding a minimum
> > latency solves a general problem. Perhaps the device should queue some
> 
> The protocol does not specificy the intervals during unsolicited igmp
> sends. It only specifies the intervals as a result of a igmp query.

RFC 3376:
"  To cover the possibility of the State-Change Report being missed by
   one or more multicast routers, it is retransmitted [Robustness
   Variable] - 1 more times, at intervals chosen at random from the
   range (0, [Unsolicited Report Interval])."
and
"8.11. Unsolicited Report Interval

   The Unsolicited Report Interval is the time between repetitions of a
   host's initial report of membership in a group.  Default: 1 second."

> The device is ready. Its just the multicast group that has not been
> established yet.
        Well, if you know that's going to happen with your device, then
again, why not queue them on start up until you have indication that
the group has been established, or delay in the driver.
        You're changing IGMP for all device types to fix a problem in
only one.
 
> There also cannot be any storm since any unsolicited igmp report by any
> system will stop the unsolicited igmp reports by any other system.

        Not if they are simultaneous, which is exactly when it is a 
problem. :-)
> 
> > > So unsolicited reports are send for an interval of at least a minute
> > > (reports are aborted if igmp reports or other info is seen).
> >
> >         This is outside the protocol spec, and the intervals are 
neither
> > random nor scaled based on any network performance metric.
> 
> Where does it say that in the spec? Again this is an *unsolicited* igmp
> report.

        See quotes above.
 
> These are *unsolicited* igmp reports. There is *no* querier supplied 
data
> yet. The first querier supplied data (or any other unsolicited igmp
> report) will immediately stop the unsolicited reports and then will
> continue to respond in randomized intervals based on the data that the
> querier has supplied.

        There are initial values, which are currently constants, but it'd
be (more) reasonable to turn those into per-interface tunables or
per-interface initial values with IB interfaces using larger values.

IGMP_Unsolicited_Report_Count (default 2)
IGMP_Unsolicited_Report_Interval (default 10secs which is 10x larger,as
        you want, than the RFC suggests).

                                                                +-DLS

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: igmp: Staggered igmp report intervals for unsolicited igmp reports
  2010-09-22 19:58           ` Christoph Lameter
       [not found]             ` <alpine.DEB.2.00.1009221448460.32661-sBS69tsa9Uj/9pzu0YdTqQ@public.gmane.org>
@ 2010-09-22 20:56             ` Bob Arendt
       [not found]               ` <4C9A6D87.2000103-x0S3BwdUo6DQT0dZR+AlfA@public.gmane.org>
  1 sibling, 1 reply; 26+ messages in thread
From: Bob Arendt @ 2010-09-22 20:56 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: David Stevens, David S. Miller, linux-rdma@vger.kernel.org,
	netdev@vger.kernel.org

On 09/22/2010 12:58 PM, Christoph Lameter wrote:
> On Wed, 22 Sep 2010, David Stevens wrote:
>> 3) I don't think it's a good idea to make up intervals, and especially
>>          non-randomized ones. The probability of getting all minimum
>> intervals
>>          is very low (which goes back to #1) and sending fixed intervals
>> may
>>          introduce a problem (packet storms) that isn't there per RFC.
>> These
>>          fixed intervals can also be either way too long or way too short,
>>          depending on link characteristics they don't account for. Leaving
>>          the intervals randomized based on querier-supplied data seems much
>>          more appropriate to me.
>
> These are *unsolicited* igmp reports. There is *no* querier supplied data
> yet. The first querier supplied data (or any other unsolicited igmp
> report) will immediately stop the unsolicited reports and then will
> continue to respond in randomized intervals based on the data that the
> querier has supplied.
>
>

There certainly seems to be some backing for part of Christoph's concept in
the IETF rfc's.  I've posted the relevant sections below.  IGMPv2 doesn't specify
a limit on retransmissions of an unsolicited Join, only that they stop once
multicast traffic is received. While IGMPv2 defines an "Unsolicited Report
Interval" default of 10 seconds, it appears that this is a significant enough
issue that the later IGMPv3 document calls out a default of 1 second, and
goes on to define a "Robustness Variable" and talks about the same case that
Christoph is trying to mitigate.

However, both rfc's *do* specify that the random timers should be used based
on a value called the "unsolicited report interval".

Perhaps implementing the IGMPv3 capability with kernel parameters for an
"unsolicited report interval" and "robustness variable" would satisfy
Christoph's issue?

-Bob Arendt

rfc2236 IGMPv2  =============================
Section 3 .... page 4 para 2
    When a host joins a multicast group, it should immediately transmit
    an unsolicited Version 2 Membership Report for that group, in case it
    is the first member of that group on the network.  To cover the
    possibility of the initial Membership Report being lost or damaged,
    it is recommended that it be repeated once or twice after short
    delays [Unsolicited Report Interval].

Section 6 ...  page 8 para 4
- "start timer" for the group on the interface, using a delay value
      chosen uniformly from the interval (0, Max Response Time], where
      Max Response time is specified in the Query.  If this is an
      unsolicited Report, the timer is set to a delay value chosen
      uniformly from the interval (0, [Unsolicited Report Interval] ].

8.10.  Unsolicited Report Interval  (page 18)
    The Unsolicited Report Interval is the time between repetitions of a
    host's initial report of membership in a group.  Default: 10 seconds.

rfc3376 IGMPv3  ============================
Section 5.1 page 19, near end
    (note - unsolicited Join is a type of State-Change report)
    To cover the possibility of the State-Change Report being missed by
    one or more multicast routers, it is retransmitted [Robustness
    Variable] - 1 more times, at intervals chosen at random from the
    range (0, [Unsolicited Report Interval]).

8.11. Unsolicited Report Interval  (page 41)
    The Unsolicited Report Interval is the time between repetitions of a
    host's initial report of membership in a group.  Default: 1 second.

8.1. Robustness Variable (page 39)
    The Robustness Variable allows tuning for the expected packet loss on
    a network.  If a network is expected to be lossy, the Robustness
    Variable may be increased.  IGMP is robust to (Robustness Variable -
    1) packet losses.  The Robustness Variable MUST NOT be zero, and
    SHOULD NOT be one.  Default: 2

8.14.1. Robustness Variable  (page 41/42)
    The Robustness Variable tunes IGMP to expected losses on a link.
    IGMPv3 is robust to (Robustness Variable - 1) packet losses, e.g., if
    the Robustness Variable is set to the default value of 2, IGMPv3 is
    robust to a single packet loss but may operate imperfectly if more
    losses occur.  On lossy subnetworks, the Robustness Variable should
    be increased to allow for the expected level of packet loss. However,
    increasing the Robustness Variable increases the leave latency of the
    subnetwork.  (The leave latency is the time between when the last
    member stops listening to a source or group and when the traffic
    stops flowing.)

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: igmp: Staggered igmp report intervals for unsolicited igmp reports
  2010-09-22 20:36               ` David Stevens
@ 2010-09-22 21:26                 ` Christoph Lameter
  0 siblings, 0 replies; 26+ messages in thread
From: Christoph Lameter @ 2010-09-22 21:26 UTC (permalink / raw)
  To: David Stevens; +Cc: David S. Miller, linux-rdma, netdev, Bob Arendt

On Wed, 22 Sep 2010, David Stevens wrote:

> > The protocol does not specificy the intervals during unsolicited igmp
> > sends. It only specifies the intervals as a result of a igmp query.
>
> RFC 3376:
> "  To cover the possibility of the State-Change Report being missed by
>    one or more multicast routers, it is retransmitted [Robustness
>    Variable] - 1 more times, at intervals chosen at random from the
>    range (0, [Unsolicited Report Interval])."
> and
> "8.11. Unsolicited Report Interval
>
>    The Unsolicited Report Interval is the time between repetitions of a
>    host's initial report of membership in a group.  Default: 1 second."


Hmmm looks like I looked at the earlier RFC 2236 3) (was not really
interested in IGMP v3, IGMPv2 is run).

   When a host joins a multicast group, it should immediately transmit
   an unsolicited Version 2 Membership Report for that group, in case it
   is the first member of that group on the network.  To cover the
   possibility of the initial Membership Report being lost or damaged,
   it is recommended that it be repeated once or twice after short
   delays [Unsolicited Report Interval].  (A simple way to accomplish
   this is to send the initial Version 2 Membership Report and then act
   as if a Group-Specific Query was received for that group, and set a
   timer appropriately).

The new Unsolicited Report Interval is promising. We need to support that?

> > The device is ready. Its just the multicast group that has not been
> > established yet.
>         Well, if you know that's going to happen with your device, then
> again, why not queue them on start up until you have indication that
> the group has been established, or delay in the driver.
>         You're changing IGMP for all device types to fix a problem in
> only one.
>
> > There also cannot be any storm since any unsolicited igmp report by any
> > system will stop the unsolicited igmp reports by any other system.
>
>         Not if they are simultaneous, which is exactly when it is a
> problem. :-)

But then they are not simultaneous since there is a fuzz factor.

> > These are *unsolicited* igmp reports. There is *no* querier supplied
> data
> > yet. The first querier supplied data (or any other unsolicited igmp
> > report) will immediately stop the unsolicited reports and then will
> > continue to respond in randomized intervals based on the data that the
> > querier has supplied.
>
>         There are initial values, which are currently constants, but it'd
> be (more) reasonable to turn those into per-interface tunables or
> per-interface initial values with IB interfaces using larger values.
>
> IGMP_Unsolicited_Report_Count (default 2)
> IGMP_Unsolicited_Report_Interval (default 10secs which is 10x larger,as
>         you want, than the RFC suggests).

Ahhh... Interesting..... 1 second now? That is much better and would avoid
long drawn out joins due to the long delays.



^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: igmp: Staggered igmp report intervals for unsolicited igmp reports
       [not found]               ` <4C9A6D87.2000103-x0S3BwdUo6DQT0dZR+AlfA@public.gmane.org>
@ 2010-09-22 21:33                 ` Christoph Lameter
  2010-09-22 21:41                   ` David Stevens
  0 siblings, 1 reply; 26+ messages in thread
From: Christoph Lameter @ 2010-09-22 21:33 UTC (permalink / raw)
  To: Bob Arendt
  Cc: David Stevens, David S. Miller,
	linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
	netdev-u79uwXL29TY76Z2rM5mHXA@public.gmane.org

On Wed, 22 Sep 2010, Bob Arendt wrote:

> multicast traffic is received. While IGMPv2 defines an "Unsolicited Report
> Interval" default of 10 seconds, it appears that this is a significant enough
> issue that the later IGMPv3 document calls out a default of 1 second, and
> goes on to define a "Robustness Variable" and talks about the same case that
> Christoph is trying to mitigate.

Actually that suggests a different way to reach the same goal:


Subject: igmp: Make unsolicited report interval conform to RFC3376

RFC3376 specifies a shorter time interval for sending igmp joins.
This can address issues where joins are slow because the initial join is
frequently lost.

Also increase the number of reports so that 10 reports are sent over a
few seconds.

Signed-off-by: Christoph Lameter <cl-vYTEC60ixJUAvxtiuMwx3w@public.gmane.org>


---
 net/ipv4/igmp.c |    4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

Index: linux-2.6/net/ipv4/igmp.c
===================================================================
--- linux-2.6.orig/net/ipv4/igmp.c	2010-09-22 16:28:17.000000000 -0500
+++ linux-2.6/net/ipv4/igmp.c	2010-09-22 16:28:54.000000000 -0500
@@ -114,9 +114,9 @@

 #define IGMP_V1_Router_Present_Timeout		(400*HZ)
 #define IGMP_V2_Router_Present_Timeout		(400*HZ)
-#define IGMP_Unsolicited_Report_Interval	(10*HZ)
+#define IGMP_Unsolicited_Report_Interval	(HZ)
 #define IGMP_Query_Response_Interval		(10*HZ)
-#define IGMP_Unsolicited_Report_Count		2
+#define IGMP_Unsolicited_Report_Count		10


 #define IGMP_Initial_Report_Delay		(1)
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: igmp: Staggered igmp report intervals for unsolicited igmp reports
  2010-09-22 21:33                 ` Christoph Lameter
@ 2010-09-22 21:41                   ` David Stevens
  2010-09-23 15:37                     ` Christoph Lameter
  0 siblings, 1 reply; 26+ messages in thread
From: David Stevens @ 2010-09-22 21:41 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: David S. Miller, linux-rdma@vger.kernel.org,
	netdev@vger.kernel.org, Bob Arendt

Christoph Lameter <cl@linux.com> wrote on 09/22/2010 02:33:14 PM:

> This can address issues where joins are slow because the initial join is
> frequently lost.
> 
> Also increment the frequency so that we get a 10 reports send over a
> few seconds.

        Except you want to conform and not conform at the same time. :-)
IGMPv2 should be: default count 2, interval 10secs
IGMPv3 should be: default count 2, interval 1sec

...and no way is it a good idea to send 10 unsolicited reports on an
Ethernet.

I think system-wide defaults must be as suggested (which allows for
v3 being shortened to 1sec, but not v2) and if you want to use longer
values, you should have either a *per-interface* tunable [ie, the default
value for your interface only] or make these per-interface variables and
have the IB code bump them up for IB interfaces only. An attached
Ethernet on the same system shouldn't be using larger values unless
bumped for some reason by an administrator.

There is no problem with current values on Ethernet; lets not create
one. :-)

                                                                +-DLS

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: igmp: Staggered igmp report intervals for unsolicited igmp reports
       [not found]             ` <alpine.DEB.2.00.1009221448460.32661-sBS69tsa9Uj/9pzu0YdTqQ@public.gmane.org>
  2010-09-22 20:36               ` David Stevens
@ 2010-09-22 21:50               ` Jason Gunthorpe
  2010-09-23 15:32                 ` Christoph Lameter
  1 sibling, 1 reply; 26+ messages in thread
From: Jason Gunthorpe @ 2010-09-22 21:50 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: David Stevens, David S. Miller, linux-rdma-u79uwXL29TY76Z2rM5mHXA,
	netdev-u79uwXL29TY76Z2rM5mHXA, Bob Arendt

On Wed, Sep 22, 2010 at 02:58:15PM -0500, Christoph Lameter wrote:
> > packets if it isn't ready quickly? A querier is what makes these
> > reliable, but for the start-up in particular, I think it'd be better
> > to not initiate the send on devices that have this problem until the
> > device is actually ready to send-- why not put the delay in the device
> > driver on initialization?
> 
> The device is ready. Its just the multicast group that has not been
> established yet.

In IB when the SA replies to a group join the group should be ready,
prior to that the device can't send into the group because it has no
MLID for the group.. If you have a MLID then the group is working.

Is the issue you are dropping IGMP packets because the 224.0.0.2 join
hasn't finished? Ideally you'd wait for the SA to reply before sending
a IGMP, but a simpler solution might just be to use the broadcast MLID
for packets addressed to a MGID that has not yet got a MLID. This
would be similar to the ethernet behaviour of flooding.

Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: igmp: Staggered igmp report intervals for unsolicited igmp reports
  2010-09-22 21:50               ` Jason Gunthorpe
@ 2010-09-23 15:32                 ` Christoph Lameter
  2010-09-23 17:26                   ` Jason Gunthorpe
  0 siblings, 1 reply; 26+ messages in thread
From: Christoph Lameter @ 2010-09-23 15:32 UTC (permalink / raw)
  To: Jason Gunthorpe
  Cc: David Stevens, David S. Miller, linux-rdma, netdev, Bob Arendt

On Wed, 22 Sep 2010, Jason Gunthorpe wrote:

> > The device is ready. Its just the multicast group that has not been
> > established yet.
>
> In IB when the SA replies to a group join the group should be ready,
> prior to that the device can't send into the group because it has no
> MLID for the group.. If you have a MLID then the group is working.

When the SA replies it has created the MLID but not reconfigured the
fabric yet. So the initial IGMP messages get lost.

> Is the issue you are dropping IGMP packets because the 224.0.0.2 join
> hasn't finished? Ideally you'd wait for the SA to reply before sending
> a IGMP, but a simpler solution might just be to use the broadcast MLID
> for packets addressed to a MGID that has not yet got a MLID. This
> would bebe similar to the ethernet behaviour of flooding.

IGMP reports are sent on the multicast group not on 224.0.0.2. 224.0.0.2
is only used when leaving a multicast group.

I thought also about solutions along the same lines. We could modify the
IB layer to send to 224.0.0.2 until the SA has confirmed the
creation of the MC group. For that to work we first would need to modify
the SA logic to ensure that it only sends confirmation *after* the fabric
has been reconfigured. Then we need to switch the MLIDs of the MC group
when the notification is received.

If the IB layer has not joined 224.0.0.2 yet (and it will take awhile)
then we could even fallback to broadcast until its ready.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: igmp: Staggered igmp report intervals for unsolicited igmp reports
  2010-09-22 21:41                   ` David Stevens
@ 2010-09-23 15:37                     ` Christoph Lameter
       [not found]                       ` <alpine.DEB.2.00.1009231021080.32567-sBS69tsa9Uj/9pzu0YdTqQ@public.gmane.org>
  0 siblings, 1 reply; 26+ messages in thread
From: Christoph Lameter @ 2010-09-23 15:37 UTC (permalink / raw)
  To: David Stevens
  Cc: David S. Miller, linux-rdma@vger.kernel.org,
	netdev@vger.kernel.org, Bob Arendt

On Wed, 22 Sep 2010, David Stevens wrote:

> >
> > Also increment the frequency so that we get a 10 reports send over a
> > few seconds.
>
>         Except you want to conform and not conform at the same time. :-)
> IGMPv2 should be: default count 2, interval 10secs
> IGMPv3 should be: default count 2, interval 1sec

This is during the period of unsolicited igmp reports. We do not know if
this group is managed using V3 or V2 since no igmp query/report has been
received yet.

> ...and no way is it a good idea to send 10 unsolicited reports on an
> Ethernet.

Why would that be an issue?

The IGMPv2 RFC has no strict limit and RFC3376
mentions that the retransmission occurs "Robustness Variable" times
minus one. Choosing 10 for the "Robustness Variable" is certainly ok.

If we do not increase the number of reports but just limit the interval
then the chance of outages of a second or so during mc group creation
causing routers missing igmp reports is significantly increased.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: igmp: Staggered igmp report intervals for unsolicited igmp reports
  2010-09-23 15:32                 ` Christoph Lameter
@ 2010-09-23 17:26                   ` Jason Gunthorpe
       [not found]                     ` <20100923172603.GM11157-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
  0 siblings, 1 reply; 26+ messages in thread
From: Jason Gunthorpe @ 2010-09-23 17:26 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: David Stevens, linux-rdma, netdev, Bob Arendt

On Thu, Sep 23, 2010 at 10:32:17AM -0500, Christoph Lameter wrote:

> > Is the issue you are dropping IGMP packets because the 224.0.0.2 join
> > hasn't finished? Ideally you'd wait for the SA to reply before sending
> > a IGMP, but a simpler solution might just be to use the broadcast MLID
> > for packets addressed to a MGID that has not yet got a MLID. This
> > would bebe similar to the ethernet behaviour of flooding.
> 
> IGMP reports are sent on the multicast group not on 224.0.0.2. 224.0.0.2
> is only used when leaving a multicast group.

Hm, that is quite different than in IGMPv3.. How does this work at all
in IB? A message to the multicast group isn't going to make it to any
routers unless the routers use some other means to join the IB MGID.

Jason

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: igmp: Staggered igmp report intervals for unsolicited igmp reports
       [not found]                     ` <20100923172603.GM11157-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
@ 2010-09-23 17:37                       ` Christoph Lameter
       [not found]                         ` <alpine.DEB.2.00.1009231230100.2962-sBS69tsa9Uj/9pzu0YdTqQ@public.gmane.org>
  0 siblings, 1 reply; 26+ messages in thread
From: Christoph Lameter @ 2010-09-23 17:37 UTC (permalink / raw)
  To: Jason Gunthorpe
  Cc: David Stevens, linux-rdma-u79uwXL29TY76Z2rM5mHXA,
	netdev-u79uwXL29TY76Z2rM5mHXA, Bob Arendt

On Thu, 23 Sep 2010, Jason Gunthorpe wrote:

> On Thu, Sep 23, 2010 at 10:32:17AM -0500, Christoph Lameter wrote:
>
> > > Is the issue you are dropping IGMP packets because the 224.0.0.2 join
> > > hasn't finished? Ideally you'd wait for the SA to reply before sending
> > > a IGMP, but a simpler solution might just be to use the broadcast MLID
> > > for packets addressed to a MGID that has not yet got a MLID. This
> > > would bebe similar to the ethernet behaviour of flooding.
> >
> > IGMP reports are sent on the multicast group not on 224.0.0.2. 224.0.0.2
> > is only used when leaving a multicast group.
>
> Hm, that is quite different than in IGMPv3.. How does this work at all
> in IB? A message to the multicast group isn't going to make it to any
> routers unless the routers use some other means to join the IB MGID.

IPoIB creates a infiniband multicast group via the IB calls for a IP
multicast group. Then IGMP comes into play and the kernel sends the IP
based igmp report. This igmp report must be received by an outside router
(on an IP network) in order for traffic to get forwarded into the IB
fabric. You can end up with a IB multicast configuration that is all fine
but with loss of the unsolicited packets due to fabric reconfiguration not
being complete yet. The larger the fabric the worse the situation.

If all unsolicited igmp reports are lost then the router will
only start forwarding the mc group after the reporting intervals
(which could be in the range of minutes) when it triggers igmp reports
through a general igmp query. Until that time the MC group looks dead. And
people and software may conclude that the **** network is broken.

This is a general issue for any network where configurations for MC
forwarding is needed and where initial igmp reports may get lost. A
staggering of time intervals would be a general solution to that issue.

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: igmp: Staggered igmp report intervals for unsolicited igmp reports
       [not found]                         ` <alpine.DEB.2.00.1009231230100.2962-sBS69tsa9Uj/9pzu0YdTqQ@public.gmane.org>
@ 2010-09-23 17:46                           ` Jason Gunthorpe
       [not found]                             ` <20100923174614.GN11157-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
  2010-09-27 19:32                           ` David Stevens
  1 sibling, 1 reply; 26+ messages in thread
From: Jason Gunthorpe @ 2010-09-23 17:46 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: David Stevens, linux-rdma-u79uwXL29TY76Z2rM5mHXA,
	netdev-u79uwXL29TY76Z2rM5mHXA, Bob Arendt

On Thu, Sep 23, 2010 at 12:37:28PM -0500, Christoph Lameter wrote:
> On Thu, 23 Sep 2010, Jason Gunthorpe wrote:
> 
> > On Thu, Sep 23, 2010 at 10:32:17AM -0500, Christoph Lameter wrote:
> >
> > > > Is the issue you are dropping IGMP packets because the 224.0.0.2 join
> > > > hasn't finished? Ideally you'd wait for the SA to reply before sending
> > > > a IGMP, but a simpler solution might just be to use the broadcast MLID
> > > > for packets addressed to a MGID that has not yet got a MLID. This
> > > > would bebe similar to the ethernet behaviour of flooding.
> > >
> > > IGMP reports are sent on the multicast group not on 224.0.0.2. 224.0.0.2
> > > is only used when leaving a multicast group.
> >
> > Hm, that is quite different than in IGMPv3.. How does this work at all
> > in IB? A message to the multicast group isn't going to make it to any
> > routers unless the routers use some other means to join the IB MGID.
> 
> IPoIB creates a infiniband multicast group via the IB calls for a IP
> multicast group. Then IGMP comes into play and the kernel sends the IP
> based igmp report. This igmp report must be received by an outside router
> (on an IP network) in order to for traffic to get forwarded into the IB
> fabric. You can end up with a IB multicast configuration that is all fine
> but with loss of the unsolicited packets due to fabric reconfiguration not
> being complete yet. The larger the fabric the worse the situation.

But my point is that IB has very limited multicast, if I create a IB
group and then send IGMP into that group *it will not reach a router*.

I have to send something to the all routers group or the all IGMPv3
group to get it to reach a router with any reliability.

The only way this kind of scheme could work is if an IGMPv2 IPoIB
router listens for IB MGID Create notices from the SA and
automatically joins all groups that are created, so it can get IGMPv2
membership reports. Which obviously adds more delay, lag, and risk.

I'm *guessing* that the change in IGMPv3 to send reports to 224.0.0.22
(all IGMPv3 multicast address) is related to this sort of problem, and
it seems like on IB IGMPv2 is not a good fit and should not be used if
v3 is available..

Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: igmp: Staggered igmp report intervals for unsolicited igmp reports
       [not found]                             ` <20100923174614.GN11157-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
@ 2010-09-23 17:56                               ` Christoph Lameter
  0 siblings, 0 replies; 26+ messages in thread
From: Christoph Lameter @ 2010-09-23 17:56 UTC (permalink / raw)
  To: Jason Gunthorpe
  Cc: David Stevens, linux-rdma-u79uwXL29TY76Z2rM5mHXA,
	netdev-u79uwXL29TY76Z2rM5mHXA, Bob Arendt

On Thu, 23 Sep 2010, Jason Gunthorpe wrote:

> > IPoIB creates a infiniband multicast group via the IB calls for a IP
> > multicast group. Then IGMP comes into play and the kernel sends the IP
> > based igmp report. This igmp report must be received by an outside router
> > (on an IP network) in order to for traffic to get forwarded into the IB
> > fabric. You can end up with a IB multicast configuration that is all fine
> > but with loss of the unsolicited packets due to fabric reconfiguration not
> > being complete yet. The larger the fabric the worse the situation.
>
> But my point is that IB has very limited multicast, if I create a IB
> group and then send IGMP into that group *it will not reach a router*.

The IPoIB routers automatically join all IP MC groups created.

> The only way this kind of scheme could work is if an IGMPv2 IPoIB
> router listens for IB MGID Create notices from the SA and
> automatically joins all groups that are created, so it can get IGMPv2
> membership reports. Which obviously adds more delay, lag, and risk.

Right that is how it works now.

> I'm *guessing* that the change in IGMPv3 to send reports to 224.0.0.22
> (all IGMPv3 multicast address) is related to this sort of problem, and
> it seems like on IB IGMPv2 is not a good fit and should not be used if
> v3 is available..

Existing routers do not support IGMPv3.

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: igmp: Allow mininum interval specification for igmp timers.
       [not found] ` <alpine.DEB.2.00.1009221354410.32661-sBS69tsa9Uj/9pzu0YdTqQ@public.gmane.org>
  2010-09-22 19:01   ` igmp: Staggered igmp report intervals for unsolicited igmp reports Christoph Lameter
@ 2010-09-24  4:38   ` David Miller
       [not found]     ` <20100923.213823.137834706.davem-fT/PcQaiUtIeIZ0/mPfg9Q@public.gmane.org>
  1 sibling, 1 reply; 26+ messages in thread
From: David Miller @ 2010-09-24  4:38 UTC (permalink / raw)
  To: cl-vYTEC60ixJUAvxtiuMwx3w
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA, netdev-u79uwXL29TY76Z2rM5mHXA,
	rda-x0S3BwdUo6DQT0dZR+AlfA, dlstevens-r/Jw6+rmf7HQT0dZR+AlfA

From: Christoph Lameter <cl-vYTEC60ixJUAvxtiuMwx3w@public.gmane.org>
Date: Wed, 22 Sep 2010 13:59:30 -0500 (CDT)

> IGMP timers sometimes fire too rapidly due to randomization of the
> intervalsfrom 0 to max_delay in igmp_start_timer().
 ...
> Signed-off-by: Christoph Lameter <cl-vYTEC60ixJUAvxtiuMwx3w@public.gmane.org>


This change seems reasonable to me, what do you think David?
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: igmp: Allow mininum interval specification for igmp timers.
       [not found]     ` <20100923.213823.137834706.davem-fT/PcQaiUtIeIZ0/mPfg9Q@public.gmane.org>
@ 2010-09-27 17:41       ` David Stevens
       [not found]         ` <OF4BA8F9C2.467056E9-ON882577AB.005F4832-882577AB.00612DD6-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  0 siblings, 1 reply; 26+ messages in thread
From: David Stevens @ 2010-09-27 17:41 UTC (permalink / raw)
  To: David Miller
  Cc: cl-vYTEC60ixJUAvxtiuMwx3w, linux-rdma-u79uwXL29TY76Z2rM5mHXA,
	netdev-u79uwXL29TY76Z2rM5mHXA,
	netdev-owner-u79uwXL29TY76Z2rM5mHXA, rda-x0S3BwdUo6DQT0dZR+AlfA

David Miller <davem-fT/PcQaiUtIeIZ0/mPfg9Q@public.gmane.org> wrote on 09/23/2010 09:38:23 PM:

> 
> > IGMP timers sometimes fire too rapidly due to randomization of the
> > intervalsfrom 0 to max_delay in igmp_start_timer().
>  ...
> > Signed-off-by: Christoph Lameter <cl-vYTEC60ixJUAvxtiuMwx3w@public.gmane.org>
> 
> 
> This change seems reasonable to me, what do you think David?

[sorry for the delay -- I was off-line for the last few days]
Dave,
        I don't know if you saw the more extended discussion we
had on this or not, but I think while this would help for IB,
it's not appropriate in general. These can in fact be "0" per
RFC which is worst case for IB if there is a delay for being
able to use the group, and the newer IGMPv3 standard has shortened
the max interval from 10sec in v2 to 1 sec.
        Fundamentally, the problem is that the device needs to
be able to send on the group immediately for IGMP; that it
can't for IB is the problem, and I think it should be solved
in IB by either queueing packets there or delaying there as
needed before doing the joins.
        I don't think tweaking IGMP for this is appropriate at
all, but if done there, it ought to be per-interface so it
doesn't change anything for other network types which don't
have this problem. It should be randomized and not the fixed
delays to prevent storms on a mass start-up, and we also don't
want to be increasing the number of duplicates for other
network types. The default should be 2 reports in randomized
0-10 sec for each for v2, 2 in randomized 0-1 sec for v3.

                                                        +-DLS

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: igmp: Allow mininum interval specification for igmp timers.
       [not found]         ` <OF4BA8F9C2.467056E9-ON882577AB.005F4832-882577AB.00612DD6-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2010-09-27 17:54           ` David Miller
       [not found]             ` <20100927.105444.214208865.davem-fT/PcQaiUtIeIZ0/mPfg9Q@public.gmane.org>
  0 siblings, 1 reply; 26+ messages in thread
From: David Miller @ 2010-09-27 17:54 UTC (permalink / raw)
  To: dlstevens-r/Jw6+rmf7HQT0dZR+AlfA
  Cc: cl-vYTEC60ixJUAvxtiuMwx3w, linux-rdma-u79uwXL29TY76Z2rM5mHXA,
	netdev-u79uwXL29TY76Z2rM5mHXA,
	netdev-owner-u79uwXL29TY76Z2rM5mHXA, rda-x0S3BwdUo6DQT0dZR+AlfA

From: David Stevens <dlstevens-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
Date: Mon, 27 Sep 2010 10:41:20 -0700

>         I don't know if you saw the more extended discussion we
> had on this or not, but I think while this would help for IB,
> it's not appropriate in general. These can in fact be "0" per
> RFC which is worst case for IB if there is a delay for being
> able to use the group, and the newer IGMPv3 standard has shortened
> the max interval from 10sec in v2 to 1 sec.

I did see the extended discussion, and it was interesting :-)

But that mainly focused on the second patch, which I appropriately
marked as needing changes in patchwork.

This patch on the other hand is attacking a different problem,
namely avoiding the worst cases caused by the randomization we
do for the timer.

With bad luck this thing times out way too fast because the total of
all of the randomized intervals can end up being very small, and I
think we should fix that independently of the other issues hit by the
IB folks.

Don't you agree?
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: igmp: Allow mininum interval specification for igmp timers.
       [not found]             ` <20100927.105444.214208865.davem-fT/PcQaiUtIeIZ0/mPfg9Q@public.gmane.org>
@ 2010-09-27 18:16               ` David Stevens
  2010-09-27 19:55               ` David Stevens
  1 sibling, 0 replies; 26+ messages in thread
From: David Stevens @ 2010-09-27 18:16 UTC (permalink / raw)
  To: David Miller
  Cc: cl-vYTEC60ixJUAvxtiuMwx3w, linux-rdma-u79uwXL29TY76Z2rM5mHXA,
	netdev-u79uwXL29TY76Z2rM5mHXA,
	netdev-owner-u79uwXL29TY76Z2rM5mHXA, rda-x0S3BwdUo6DQT0dZR+AlfA

netdev-owner-u79uwXL29TY76Z2rM5mHXA@public.gmane.org wrote on 09/27/2010 10:54:44 AM:

> From: David Miller <davem-fT/PcQaiUtIeIZ0/mPfg9Q@public.gmane.org>
> To: David Stevens/Beaverton/IBM@IBMUS
> Cc: cl-vYTEC60ixJUAvxtiuMwx3w@public.gmane.org, linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org, 
> netdev-u79uwXL29TY76Z2rM5mHXA@public.gmane.org, netdev-owner-u79uwXL29TY76Z2rM5mHXA@public.gmane.org, rda-x0S3BwdUo6DQT0dZR+AlfA@public.gmane.org
> Date: 09/27/2010 10:54 AM
> Subject: Re: igmp: Allow mininum interval specification for igmp timers.
> Sent by: netdev-owner-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> 
> From: David Stevens <dlstevens-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
> Date: Mon, 27 Sep 2010 10:41:20 -0700
> 
> >         I don't know if you saw the more extended discussion we
> > had on this or not, but I think while this would help for IB,
> > it's not appropriate in general. These can in fact be "0" per
> > RFC which is worst case for IB if there is a delay for being
> > able to use the group, and the newer IGMPv3 standard has shortened
> > the max interval from 10sec in v2 to 1 sec.
> 
> I did see the extended discussion, and it was interesting :-)
> 
> But that mainly focused on the second patch, which I appropriately
> marked as needing changes in patchwork.

        OK, I'm not sure I've seen them all; haven't caught up on
e-mail yet.

> This patch on the other hand is attacking a different problem,
> namely avoiding the worst cases caused by the randomization we
> do for the timer.

        I think the multiples are to allow for drops and the
randomization is to prevent storms. As far as IGMP is concerned,
it's perfectly fine to send them back-to-back, since drops are
not necessarily time periods of network outage (as with IB) but
rather transient queue overflows where even the short delay of
a "0" timer but still having protocol and packet transmit delay
would be fine.
 
> With bad luck this thing times out way too fast because the total of
> all of the randomized intervals can end up being very small, and I
> think we should fix that independently of the other issues hit by the
> IB folks.
> 
> Don't you agree?

        If you mean enforcing a minimum spacing higher than a "0" timer,
I don't know that it's an issue for other network types. According to
IGMPv3, all of them (3 total) on average would be sent in 1 sec, but
it also isn't fatal to drop all of them. To the extent that 1 sec is
"small," it is intentional.
        I'll try digging out the particular patch and comment. I'm not
sure many of these tweaks would necessarily hurt other network types
but I think the current code also isn't a problem for anything but IB,
and that issue can be fixed within IB.

                                                                +-DLS


--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: igmp: Staggered igmp report intervals for unsolicited igmp reports
       [not found]                       ` <alpine.DEB.2.00.1009231021080.32567-sBS69tsa9Uj/9pzu0YdTqQ@public.gmane.org>
@ 2010-09-27 19:24                         ` David Stevens
  0 siblings, 0 replies; 26+ messages in thread
From: David Stevens @ 2010-09-27 19:24 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: David S. Miller,
	linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
	netdev-u79uwXL29TY76Z2rM5mHXA@public.gmane.org, Bob Arendt

Christoph Lameter <cl-vYTEC60ixJUAvxtiuMwx3w@public.gmane.org> wrote on 09/23/2010 08:37:48 AM:

> 
> On Wed, 22 Sep 2010, David Stevens wrote:
> 
> > >
> > > Also increment the frequency so that we get a 10 reports send over a
> > > few seconds.
> >
> >         Except you want to conform and not conform at the same time. :-)
> > IGMPv2 should be: default count 2, interval 10secs
> > IGMPv3 should be: default count 2, interval 1sec
> 
> This is during the period of unsolicited igmp reports. We do not know if
> this group is managed using V3 or V2 since no igmp query/report has been
> received yet.

        The default is IGMPv3 unless a v2 querier is present. You can force
it to be IGMPv2 by having an IGMPv2 querier on the network or by using
the force_igmp_version tunable.

> > ...and no way is it a good idea to send 10 unsolicited reports on an
> > Ethernet.
> 
> Why would that be an issue?

        Because the traffic for all joins is multiplied by >3. If you're
joining 1 group, maybe that wouldn't be an issue, but what if I join
100, and what if hundreds of other hosts on that network do too? And
applications that dynamically join and leave groups may do this 
"normally."
Even 3 reports on switched networks with low loss is really unnecessary
overkill; 10 is just wasted bandwidth.

> The IGMPv2 RFC has no strict limit and RFC3376
> mentions that the retransmission occurs "Robustness Variable" times
> minus one. Choosing 10 for the "Robustness Variable" is certainly ok.

        Both of them specify the default value and say a querier is the
mechanism for changing that. If you want to follow the RFC, the default
is "2", not "10." While it'd be reasonable for a sysadmin to tune this
per-interface without a querier, it's not reasonable to make all linux
systems on all networks more than triple the number of reports they send
from the RFC-specified default. Right?!? :-)
 
> If we do not increase the number of reports but just limit the interval
> then the chance of outages of a second or so during mc group creation
> causing routers missing igmp reports is significantly increased.

        If you can't send on a group for 1 second, all of the initial
IGMPv3 reports will be lost about half of the time if we make that
conformant (it looks like it now uses the 10sec v2 time instead of the
1 sec v3 time it should). That's a problem IB needs to solve. Ideally,
you wouldn't want to return from the hardware join until you can actually
send the reports, but I expect there are locks held and that can't be 1
second of spinning on a processor. So, I think you really should put a queue in
IB for that hardware multicast address and send those packets when/if you
get positive acknowledgement (much as done for ARP completion, but maybe
queue more than 1) from the fabric that you can use it. If you don't get
any sort of ACK for that, then you can instrument a delay for it, but
any fixed number you use may be either too big or too small for a
particular fabric.

                                                                +-DLS



--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: igmp: Staggered igmp report intervals for unsolicited igmp reports
       [not found]                         ` <alpine.DEB.2.00.1009231230100.2962-sBS69tsa9Uj/9pzu0YdTqQ@public.gmane.org>
  2010-09-23 17:46                           ` Jason Gunthorpe
@ 2010-09-27 19:32                           ` David Stevens
  1 sibling, 0 replies; 26+ messages in thread
From: David Stevens @ 2010-09-27 19:32 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Jason Gunthorpe, linux-rdma-u79uwXL29TY76Z2rM5mHXA,
	netdev-u79uwXL29TY76Z2rM5mHXA, Bob Arendt

Christoph Lameter <cl-vYTEC60ixJUAvxtiuMwx3w@public.gmane.org> wrote on 09/23/2010 10:37:28 AM:

> 
> If all unsolicited igmp reports are lost then the router will
> only start forwarding the mc group after the reporting intervals
> (which could be in the range of minutes) when it triggers igmp reports
> through a general igmp query. Until that time the MC group looks dead.
> And people and software may conclude that the **** network is broken.

        You can, of course, add a querier (or configure it, assuming
an attached switch supports it) and set the query interval and robustness
count as appropriate for that network.

> This is a general issue for any network where configurations for MC
> forwarding is needed and where initial igmp reports may get lost.

Meaning "IB-only", right? :-) Maybe other NBMA networks too, but
certainly not a typical problem for typical networks (i.e., Ethernet).

> A staggering of time intervals would be a general solution to that
> issue.

As would be having those networks queue packets for hardware addresses
they know require a delay before a transmit can complete. But that approach
can't adversely affect already-working solutions for typical networks, or
depart unnecessarily from established standard protocols.

 +-DLS

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: igmp: Allow mininum interval specification for igmp timers.
       [not found]             ` <20100927.105444.214208865.davem-fT/PcQaiUtIeIZ0/mPfg9Q@public.gmane.org>
  2010-09-27 18:16               ` David Stevens
@ 2010-09-27 19:55               ` David Stevens
  2010-09-27 20:20                 ` Christoph Lameter
  1 sibling, 1 reply; 26+ messages in thread
From: David Stevens @ 2010-09-27 19:55 UTC (permalink / raw)
  To: David Miller
  Cc: cl-vYTEC60ixJUAvxtiuMwx3w, linux-rdma-u79uwXL29TY76Z2rM5mHXA,
	netdev-u79uwXL29TY76Z2rM5mHXA,
	netdev-owner-u79uwXL29TY76Z2rM5mHXA, rda-x0S3BwdUo6DQT0dZR+AlfA

David Miller <davem-fT/PcQaiUtIeIZ0/mPfg9Q@public.gmane.org> wrote on 09/27/2010 10:54:44 AM:

> This patch on the other hand is attacking a different problem,
> namely avoiding the worst cases caused by the randomization we
> do for the timer.
> 
> With bad luck this thing times out way too fast because the total of
> all of the randomized intervals can end up being very small, and I
> think we should fix that independently of the other issues hit by the
> IB folks.

        I think I'm caught up on the discussion now. For IGMPv3, we
would send all the reports always in < 2 secs, and the average would
be < 1 sec, so I'm not sure any sort of tweaks we do to enforce a
minimum randomized interval are compatible with IGMPv3 and still
solve IB's problem.
        As I said before, I think per protocol, back-to-back is both
allowed and not a problem, even if both subsequent randomized reports
come out to 0 time. But if we wanted to enforce a minimum interval
of, say, X, then I think the better way to do that is to set the
timer to X + rand(Interval-X) and not a table of fixed intervals
as in the original patch. For v2, X=1 or 2 sec and Interval=10
might work well, but for v3, the entire interval is 1 sec and I
think I saw that the set-up time for the fabric may be on the
order of 1 sec.
        I also don't think that we want those kinds of delays on
Ethernet. A program may join and send lots of traffic in 1 sec,
and if the immediate join is lost, one of the quickly-following
<1 sec duplicate reports will make it recover and work. Delaying
the minimum would guarantee it wouldn't work until that minimum
and drop all that traffic if the immediate report is lost, then.
        Really, of course, I think the solution belongs in IB, but
if we did anything in IGMP, I'd prefer it were a per-interface
tunable that defaults as in the RFC. Since you can change the
interval and # of reports through a querier now, exporting the
default values of (10,2) for v2 and (1,2) for v3 to instead be
per-interface tunables and then bumped as needed for IB would
allow tweaking without running a querier. But a querier that's
using default values would also override that and cause the
problem all over again. Queuing in the driver until the MAC
address is usable solves it generally.

        Also, MLD and IPv6 will have all these same issues, and
working multicasting is even more important there.

                                                                +-DLS


--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: igmp: Allow mininum interval specification for igmp timers.
  2010-09-27 19:55               ` David Stevens
@ 2010-09-27 20:20                 ` Christoph Lameter
       [not found]                   ` <alpine.DEB.2.00.1009271503420.14117-sBS69tsa9Uj/9pzu0YdTqQ@public.gmane.org>
  0 siblings, 1 reply; 26+ messages in thread
From: Christoph Lameter @ 2010-09-27 20:20 UTC (permalink / raw)
  To: David Stevens; +Cc: David Miller, linux-rdma, netdev, netdev-owner, rda

On Mon, 27 Sep 2010, David Stevens wrote:

> > With bad luck this thing times out way too fast because the total of
> > all of the randomized intervals can end up being very small, and I
> > think we should fix that independently of the other issues hit by the
> > IB folks.
>
>         I think I'm caught up on the discussion now. For IGMPv3, we
> would send all the reports always in < 2 secs, and the average would
> be < 1 sec, so I'm not sure any sort of tweaks we do to enforce a
> minimum randomized interval are compatible with IGMPv3 and still
> solve IB's problem.

Ok thanks for the effort but so far I do not see you having caught up. I'd
rather avoid responding to the misleading statements you made in other
replies and just respond to where you missed the boat here.

>         As I said before, I think per protocol, back-to-back is both
> allowed and not a problem, even if both subsequent randomized reports
> come out to 0 time. But if we wanted to enforce a minimum interval
> of, say, X, then I think the better way to do that is to set the
> timer to X + rand(Interval-X) and not a table of fixed intervals

The second patch sets the intervals to X .. X + Rand (interval) and not to
a table of fixed intervals as you state here. I have pointed this out
before.

> as in the original patch. For v2, X=1 or 2 sec and Interval=10
> might work well, but for v3, the entire interval is 1 sec and I
> think I saw that the set-up time for the fabric may be on the
> order of 1 sec.

Again there is no knowledge about V2 or V3 without a query and this is
during the period when no querier is known yet. You stated elsewhere that
I can assume V3 by default? So 1 sec?

>         I also don't think that we want those kinds of delays on
> Ethernet. A program may join and send lots of traffic in 1 sec,
> and if the immediate join is lost, one of the quickly-following
> <1 sec duplicate reports will make it recover and work. Delaying
> the minimum would guarantee it wouldn't work until that minimum
> and drop all that traffic if the immediate report is lost, then.

There can be any number of reasons that a short outage could prevent the
packets from going through. A buffer overrun (that you mentioned
elsewhere) usually causes lots of packets to be lost. Buffer overrun
scenarios usually mean that all igmp queries are lost.

>         Really, of course, I think the solution belongs in IB, but
> if we did anything in IGMP, I'd prefer it were a per-interface
> tunable that defaults as in the RFC. Since you can change the
> interval and # of reports through a querier now, exporting the
> default values of (10,2) for v2 and (1,2) for v3 to instead be
> per-interface tunables and then bumped as needed for IB would
> allow tweaking without running a querier. But a querier that's
> using default values would also override that and cause the
> problem all over again. Queuing in the driver until the MAC
> address is usable solves it generally.

There is no solution on the IB layer since there is no notification when
the fabric reconfiguration necessary for a multicast group is complete.

The querier is of no use since (for the gazillionth time) this is an
unsolicited IGMP report. If there is a querier then the unsolicited igmp
reports would not be used but the timeout indicated by the querier would
be used.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: igmp: Allow mininum interval specification for igmp timers.
       [not found]                   ` <alpine.DEB.2.00.1009271503420.14117-sBS69tsa9Uj/9pzu0YdTqQ@public.gmane.org>
@ 2010-09-27 21:45                     ` David Stevens
       [not found]                       ` <OF4DDFA464.A933254C-ON882577AB.00754000-882577AB.00778C77-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  0 siblings, 1 reply; 26+ messages in thread
From: David Stevens @ 2010-09-27 21:45 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: David Miller, linux-rdma-u79uwXL29TY76Z2rM5mHXA,
	netdev-u79uwXL29TY76Z2rM5mHXA,
	netdev-owner-u79uwXL29TY76Z2rM5mHXA, rda-x0S3BwdUo6DQT0dZR+AlfA

Christoph Lameter <cl-vYTEC60ixJUAvxtiuMwx3w@public.gmane.org> wrote on 09/27/2010 01:20:54 PM:

> 
> >         As I said before, I think per protocol, back-to-back is both
> > allowed and not a problem, even if both subsequent randomized reports
> > come out to 0 time. But if we wanted to enforce a minimum interval
> > of, say, X, then I think the better way to do that is to set the
> > timer to X + rand(Interval-X) and not a table of fixed intervals
> 
> The second patch sets the intervals to X .. X + Rand (interval) and not
> to a table of fixed intervals as you state here. I have pointed this out
> before.

        Sorry if I've misunderstood something you're proposing, but what
you describe above would be certainly technically incorrect. There are
really no circumstances for sending a report greater than <Interval>
that is protocol-compliant. You can enforce a minimum greater than 0,
which is a departure from both RFCs, though IGMPv2 uses wishy-washy
language. The intent for both was to explicitly allow 0, IMO.

> 
> > as in the original patch. For v2, X=1 or 2 sec and Interval=10
> > might work well, but for v3, the entire interval is 1 sec and I
> > think I saw that the set-up time for the fabric may be on the
> > order of 1 sec.
> 
> Again there is no knowledge about V2 or V3 without a query and this is
> during the period when no querier is known yet. You stated elsewhere
> that I can assume V3 by default? So 1 sec?

        Yes, without a querier or the tunable to force it to IGMPv2,
the default is IGMPv3. It appears there is a bug where IGMPv3 is also
using a 10sec interval (haven't verified that), but a 1 sec interval
as required makes your situation worse, not better. It makes it even
more likely that all the initial reports will occur before your set-up
is done.
 
> There can be any number of reasons that a short outage could prevent the
> packets from going through. A buffer overrun (that you mentioned
> elsewhere) usually causes lots of packets to be lost. Buffer overrun
> scenarios usually mean that all igmp queries are lost.

        You're arguing against protocol compliance. I didn't define
the protocol, I only implemented it. And your view is through the
IB lens, but I don't believe this is an actual problem in any way
for typical networks. If you wrote a standards-track RFC that modifies
IGMP for NBMA networks that require a delay or different parameters
there, I'd have no objection to implementing that. Unilaterally
changing linux's behavior on all network types without cause for
departing from RFC on the most common types is another matter.


> There is no solution on the IB layer since there is no notification when
> the fabric reconfiguration necessary for an multicast group is complete.

        Certainly that's not true; without notification, you can queue for
first use of a new hardware multicast address and send the queue after an
appropriate delay (1 sec? If that covers your set-up time). If you had
positive acknowledgement from the IB network, you'd know exactly when to
do it, but there's no need to change anything for non-IB networks here.

> The querier is of not use since (for the gazillionth of times) this is
> an unsolicited IGMP report. If there is a querier then the unsolicited igmp
> reports would not be used but the timeout indicated by the querier would
> be used.

        A querier affects unsolicited reports because it sets both the
query interval and the robustness value. If you want to send 10 reports,
you can cause that by having a querier that sets it to that many. The
initial join would then send 10 reports and the query interval can also
be as low as you like.
        But the linux code is not just for your particular problem or
particular configuration. You can solve your problem by adding a querier,
but I know you're trying to do it without. The mail I was responding to
referred also to the case of a querier present, which is actually the
"normal" case for using full IGMP. I'm saying that for the non-querier
case, making those per-interface configurable is reasonable because
they *are* querier-changeable, but you can also use a querier to change
it _for_the_unsolicited_reports_, as well as making the querier interval
small enough that you don't have to care at all whether any or all of
the unsolicited reports are lost.

                                                                +-DLS

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: igmp: Allow mininum interval specification for igmp timers.
       [not found]                       ` <OF4DDFA464.A933254C-ON882577AB.00754000-882577AB.00778C77-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2010-09-28 18:42                         ` Christoph Lameter
  0 siblings, 0 replies; 26+ messages in thread
From: Christoph Lameter @ 2010-09-28 18:42 UTC (permalink / raw)
  To: David Stevens
  Cc: David Miller, linux-rdma-u79uwXL29TY76Z2rM5mHXA,
	netdev-u79uwXL29TY76Z2rM5mHXA,
	netdev-owner-u79uwXL29TY76Z2rM5mHXA, rda-x0S3BwdUo6DQT0dZR+AlfA

On Mon, 27 Sep 2010, David Stevens wrote:

> > The second patch sets the intervals to X .. X + Rand (interval) and not
> > to a table of fixed intervals as you state here. I have pointed this out
> > before.
>
>         Sorry if I've misunderstood something you're proposing, but what
> you describe above would be certainly technically incorrect. There are
> really no circumstances for sending a report greater than <Interval>
> that is protocol-compliant. You can enforce a minimum greater than 0,
> which is a departure from both RFCs, though IGMPv2 uses wishy-washy
> language. The intent for both was to explicitly allow 0, IMO.

There is no igmp interval set by any igmp query yet so this is your usual
unresponsive crappy response to something else that we are not talking
about.

I thought you were talking about the "fixed intervals" that you saw in the
patch. These initial intervals are for unsolicited igmp reports (do I need
to add that statement to every sentence in a thread where we *only*
discuss unsolicited igmp issues?) and those "intervals" are randomized
and not fixed.

> > > as in the original patch. For v2, X=1 or 2 sec and Interval=10
> > > might work well, but for v3, the entire interval is 1 sec and I
> > > think I saw that the set-up time for the fabric may be on the
> > > order of 1 sec.
> >
> > Again there is no knowledge about V2 or V3 without a query and this is
> > during the period when no querier is known yet. You stated elsewhere
> > that I can assume V3 by default? So 1 sec?
>
>         Yes, without a querier or the tunable to force it to IGMPv2,
> the default is IGMPv3. It appears there is a bug where IGMPv3 is also
> using a 10sec interval (haven't verified that), but a 1 sec interval

You do not have the linux source code tree available?

from net/ipv4/igmp.c:

#define IGMP_Unsolicited_Report_Interval        (10*HZ)

> as required makes your situation worse, not better. It makes it even
> more likely that all the initial reports will occur before your set-up
> is done.

Right. So can you please give me an approach that considers all these
issues and does not invent problems that do not exist, stays within the
subject discussed and follows the RFCs?

> > There can be any number of reasons that a short outage could prevent the
> > packets from going through. A buffer overrun (that you mentioned
> > elsewhere) usually causes lots of packets to be lost. Buffer overrun
> > scenarios usually mean that all igmp queries are lost.
>
>         You're arguing against protocol compliance. I didn't define
> the protocol, I only implemented it. And your view is through the
> IB lens, but I don't believe this is an actual problem in any way
> for typical networks. If you wrote a standards-track RFC that modifies
> IGMP for NBMA networks that require a delay or different parameters
> there, I'd have no objection to implementing that. Unilaterally
> changing linux's behavior on all network types without cause for
> departing from RFC on the most common types is another matter.

The RFCs state that the igmp queries have to be repeated at least 3
times. The first patch ensures that a minimum time passes between two igmp
reports (to avoid them getting lost in one go). The second patch doubles
the number of igmp reports and increases the intervals so that we still
have a chance to process the join before the next igmp query is sent by
the router (which can be minutes away).

It fixes buggy behavior that we see because multicast joins take very long
or fail outright.

> > There is no solution on the IB layer since there is no notification when
> > the fabric reconfiguration necessary for an multicast group is complete.
>
>         Certainly that's not true; without notification, you can queue for
> first use of a new hardware multicast address and send the queue after an
> appropriate delay (1 sec? If that covers your set-up time). If you had
> positive acknowledgement from the IB network, you'd know exactly when to
> do it, but there's no need to change anything for non-IB networks here.

So you want an arbitrary delay for all new multicast traffic to be
created? I'd rather have a series of staggered attempts so that we can
avoid this setup time.

Also the setup time varies greatly depending on the complexity of the
fabric changes. It can be extremely fast if the multicast group is already
in use by others in the fabric. Adding a delay penalizes everyone
unnecessarily.

Also much of this seems to be contingent on IGMPv3. We are using igmpv2.

> > The querier is of not use since (for the gazillionth of times) this is
> > an unsolicited IGMP report. If there is a querier then the unsolicited igmp
> > reports would not be used but the timeout indicated by the querier would
> > be used.
>
>         A querier affects unsolicited reports because it sets both the
> query interval and the robustness value. If you want to send 10 reports,
> you can cause that by having a querier that sets it to that many. The
> initial join would then send 10 reports and the query interval can also
> be as low as you like.

The Linux IGMP subsystem does not support either of those at this point.
When the multicast group is created it has no notion of query intervals
until the first igmp query is received. That is the period of interest
that we are discussing!

>         But the linux code is not just for your particular problem or
> particular configuration. You can solve your problem by adding a querier,
> but I know you're trying to do it without. The mail I was responding to
> referred also to the case of a querier present, which is actually the
> "normal" case for using full IGMP is. I'm saying that for the non-querier
> case, making those per-interface configurable is reasonable because
> they *are* querier-changeable, but you can also use a querier to change
> it _for_the_unsolicited_reports_, as well as making the querier interval
> small enough that you don't have to care at all whether any or all of
> the unsolicited reports are lost.

The network of course has a querier that sends igmp requests in intervals
that could span minutes. We are talking about the period of time after a
join when a multicast group has been created but we have not reached the
time when the router sends an igmp query and where the various bits of
information about the igmp handling can be determined for the multicast
group.

I am not sure that you comprehend the basics of IGMP processing in the
kernel. I see knowledge about IGMP in general but you have a difficult
time to relate that to what the Linux kernel actually does.

Would you please have a look at the source code?
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 26+ messages in thread

end of thread, other threads:[~2010-09-28 18:42 UTC | newest]

Thread overview: 26+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2010-09-22 18:59 igmp: Allow mininum interval specification for igmp timers Christoph Lameter
     [not found] ` <alpine.DEB.2.00.1009221354410.32661-sBS69tsa9Uj/9pzu0YdTqQ@public.gmane.org>
2010-09-22 19:01   ` igmp: Staggered igmp report intervals for unsolicited igmp reports Christoph Lameter
     [not found]     ` <alpine.DEB.2.00.1009221400010.32661-sBS69tsa9Uj/9pzu0YdTqQ@public.gmane.org>
2010-09-22 19:30       ` David Stevens
     [not found]         ` <OFF06BBC88.0B6755D5-ON882577A6.0068F4F8-882577A6.006B31FB-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2010-09-22 19:58           ` Christoph Lameter
     [not found]             ` <alpine.DEB.2.00.1009221448460.32661-sBS69tsa9Uj/9pzu0YdTqQ@public.gmane.org>
2010-09-22 20:36               ` David Stevens
2010-09-22 21:26                 ` Christoph Lameter
2010-09-22 21:50               ` Jason Gunthorpe
2010-09-23 15:32                 ` Christoph Lameter
2010-09-23 17:26                   ` Jason Gunthorpe
     [not found]                     ` <20100923172603.GM11157-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
2010-09-23 17:37                       ` Christoph Lameter
     [not found]                         ` <alpine.DEB.2.00.1009231230100.2962-sBS69tsa9Uj/9pzu0YdTqQ@public.gmane.org>
2010-09-23 17:46                           ` Jason Gunthorpe
     [not found]                             ` <20100923174614.GN11157-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
2010-09-23 17:56                               ` Christoph Lameter
2010-09-27 19:32                           ` David Stevens
2010-09-22 20:56             ` Bob Arendt
     [not found]               ` <4C9A6D87.2000103-x0S3BwdUo6DQT0dZR+AlfA@public.gmane.org>
2010-09-22 21:33                 ` Christoph Lameter
2010-09-22 21:41                   ` David Stevens
2010-09-23 15:37                     ` Christoph Lameter
     [not found]                       ` <alpine.DEB.2.00.1009231021080.32567-sBS69tsa9Uj/9pzu0YdTqQ@public.gmane.org>
2010-09-27 19:24                         ` David Stevens
2010-09-24  4:38   ` igmp: Allow mininum interval specification for igmp timers David Miller
     [not found]     ` <20100923.213823.137834706.davem-fT/PcQaiUtIeIZ0/mPfg9Q@public.gmane.org>
2010-09-27 17:41       ` David Stevens
     [not found]         ` <OF4BA8F9C2.467056E9-ON882577AB.005F4832-882577AB.00612DD6-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2010-09-27 17:54           ` David Miller
     [not found]             ` <20100927.105444.214208865.davem-fT/PcQaiUtIeIZ0/mPfg9Q@public.gmane.org>
2010-09-27 18:16               ` David Stevens
2010-09-27 19:55               ` David Stevens
2010-09-27 20:20                 ` Christoph Lameter
     [not found]                   ` <alpine.DEB.2.00.1009271503420.14117-sBS69tsa9Uj/9pzu0YdTqQ@public.gmane.org>
2010-09-27 21:45                     ` David Stevens
     [not found]                       ` <OF4DDFA464.A933254C-ON882577AB.00754000-882577AB.00778C77-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2010-09-28 18:42                         ` Christoph Lameter

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).