* igmp: Allow mininum interval specification for igmp timers.
@ 2010-09-22 18:59 Christoph Lameter
[not found] ` <alpine.DEB.2.00.1009221354410.32661-sBS69tsa9Uj/9pzu0YdTqQ@public.gmane.org>
0 siblings, 1 reply; 26+ messages in thread
From: Christoph Lameter @ 2010-09-22 18:59 UTC (permalink / raw)
To: linux-rdma-u79uwXL29TY76Z2rM5mHXA, netdev-u79uwXL29TY76Z2rM5mHXA
Cc: Bob Arendt, David S. Miller, David L Stevens
IGMP timers sometimes fire too rapidly due to randomization of the
intervals from 0 to max_delay in igmp_start_timer(). For some situations
(like the initial IGMP reports that are not responses to an IGMP query) we
do not want them in too rapid succession, otherwise all the initial reports
may be lost due to a race condition with the reconfiguration of the
routers and switches going on via the link layer (like on Infiniband). If
those are lost then the router will only discover that a new mc group was
joined when the igmp query was sent. General IGMP queries may be sent
rarely on large fabrics resulting in excessively long wait times until
data starts flowing. The application may abort before then concluding that
the network hardware is not operational.
The worst case scenario without the changes will send 3 igmp reports on join:
First 3 jiffies ("immediate" (spec) ~3 ms)
Second 3 jiffies (randomization leads to shortest interval) 3 ms
Third 3 jiffies (randomization leads to shortest interval) 3 ms
Which may result in a total of less than 10ms until the kernel gives up sending
igmp reports.
Change the IGMP layer to allow the specification of minimum and maximum delay.
Calculate the IGMP_Unsolicited_Report interval based on what the interval
before this patch would be on a 100HZ kernel. 3 jiffies at 100 HZ would result
in a minimum ~30 milliseconds spacing between the initial two IGMP reports.
Round it up to 40ms.
This will result in 3 initial unsolicited reports
First "immediately" 3 jiffies (~ 3ms)
Second randomized 40ms to 10seconds later
Third randomized 40ms to 10seconds later
So a minimum of ~83ms will pass before the unsolicited reports are
given up.
Signed-off-by: Christoph Lameter <cl-vYTEC60ixJUAvxtiuMwx3w@public.gmane.org>
---
net/ipv4/igmp.c | 45 +++++++++++++++++++++++++++++++--------------
1 file changed, 31 insertions(+), 14 deletions(-)
Index: linux-2.6/net/ipv4/igmp.c
===================================================================
--- linux-2.6.orig/net/ipv4/igmp.c 2010-09-22 11:15:19.000000000 -0500
+++ linux-2.6/net/ipv4/igmp.c 2010-09-22 12:50:32.000000000 -0500
@@ -116,10 +116,17 @@
#define IGMP_V2_Router_Present_Timeout (400*HZ)
#define IGMP_Unsolicited_Report_Interval (10*HZ)
#define IGMP_Query_Response_Interval (10*HZ)
-#define IGMP_Unsolicited_Report_Count 2
+/* Parameters not specified in igmp rfc. */
+
+/* Mininum ticks to have a meaningful notion of delay */
+#define IGMP_Mininum_Delay (2)
+
+/* Control of unsolilcited reports (after join) */
+#define IGMP_Unsolicited_Report_Count 2
#define IGMP_Initial_Report_Delay (1)
+#define IGMP_Unsolicited_Report_Min_Delay (HZ/25)
/* IGMP_Initial_Report_Delay is not from IGMP specs!
* IGMP specs require to report membership immediately after
@@ -174,22 +181,30 @@ static __inline__ void igmp_stop_timer(s
spin_unlock_bh(&im->lock);
}
-/* It must be called with locked im->lock */
-static void igmp_start_timer(struct ip_mc_list *im, int max_delay)
+static inline unsigned long jiffies_rand_delay(int min_delay, int max_delay)
{
- int tv = net_random() % max_delay;
+ int d = min_delay;
+
+ if (min_delay < max_delay)
+ d += net_random() % (max_delay - min_delay);
+ return jiffies + d;
+}
+
+/* It must be called with locked im->lock */
+static void igmp_start_timer(struct ip_mc_list *im, int min_delay, int max_delay)
+{
im->tm_running = 1;
- if (!mod_timer(&im->timer, jiffies+tv+2))
+ if (!mod_timer(&im->timer, jiffies_rand_delay(min_delay, max_delay)))
atomic_inc(&im->refcnt);
}
static void igmp_gq_start_timer(struct in_device *in_dev)
{
- int tv = net_random() % in_dev->mr_maxdelay;
-
in_dev->mr_gq_running = 1;
- if (!mod_timer(&in_dev->mr_gq_timer, jiffies+tv+2))
+ if (!mod_timer(&in_dev->mr_gq_timer,
+ jiffies_rand_delay(IGMP_Mininum_Delay,
+ in_dev->mr_maxdelay)))
in_dev_hold(in_dev);
}
@@ -201,7 +216,7 @@ static void igmp_ifc_start_timer(struct
in_dev_hold(in_dev);
}
-static void igmp_mod_timer(struct ip_mc_list *im, int max_delay)
+static void igmp_mod_timer(struct ip_mc_list *im, int min_delay, int max_delay)
{
spin_lock_bh(&im->lock);
im->unsolicit_count = 0;
@@ -214,7 +229,7 @@ static void igmp_mod_timer(struct ip_mc_
}
atomic_dec(&im->refcnt);
}
- igmp_start_timer(im, max_delay);
+ igmp_start_timer(im, min_delay, max_delay);
spin_unlock_bh(&im->lock);
}
@@ -733,7 +748,8 @@ static void igmp_timer_expire(unsigned l
if (im->unsolicit_count) {
im->unsolicit_count--;
- igmp_start_timer(im, IGMP_Unsolicited_Report_Interval);
+ igmp_start_timer(im, IGMP_Unsolicited_Report_Min_Delay,
+ IGMP_Unsolicited_Report_Interval);
}
im->reporter = 1;
spin_unlock(&im->lock);
@@ -911,7 +927,7 @@ static void igmp_heard_query(struct in_d
igmp_marksources(im, ntohs(ih3->nsrcs), ih3->srcs);
spin_unlock_bh(&im->lock);
if (changed)
- igmp_mod_timer(im, max_delay);
+ igmp_mod_timer(im, IGMP_Mininum_Delay, max_delay);
}
read_unlock(&in_dev->mc_list_lock);
}
@@ -1169,7 +1185,7 @@ static void igmp_group_added(struct ip_m
return;
if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev)) {
spin_lock_bh(&im->lock);
- igmp_start_timer(im, IGMP_Initial_Report_Delay);
+ igmp_start_timer(im, IGMP_Mininum_Delay, IGMP_Initial_Report_Delay);
spin_unlock_bh(&im->lock);
return;
}
@@ -1258,7 +1274,8 @@ void ip_mc_rejoin_group(struct ip_mc_lis
return;
if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev)) {
- igmp_mod_timer(im, IGMP_Initial_Report_Delay);
+ igmp_mod_timer(im, IGMP_Mininum_Delay,
+ IGMP_Initial_Report_Delay);
return;
}
/* else, v3 */
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply [flat|nested] 26+ messages in thread[parent not found: <alpine.DEB.2.00.1009221354410.32661-sBS69tsa9Uj/9pzu0YdTqQ@public.gmane.org>]
* igmp: Staggered igmp report intervals for unsolicited igmp reports [not found] ` <alpine.DEB.2.00.1009221354410.32661-sBS69tsa9Uj/9pzu0YdTqQ@public.gmane.org> @ 2010-09-22 19:01 ` Christoph Lameter [not found] ` <alpine.DEB.2.00.1009221400010.32661-sBS69tsa9Uj/9pzu0YdTqQ@public.gmane.org> 2010-09-24 4:38 ` igmp: Allow mininum interval specification for igmp timers David Miller 1 sibling, 1 reply; 26+ messages in thread From: Christoph Lameter @ 2010-09-22 19:01 UTC (permalink / raw) To: linux-rdma-u79uwXL29TY76Z2rM5mHXA, netdev-u79uwXL29TY76Z2rM5mHXA Cc: Bob Arendt, David S. Miller, David L Stevens The earlier patch added an initial minimum latency and got us up to ~80ms. However, there are large networks that take longer to configure multicast paths. This patch changes the behavior for unsolicited igmp reports to ensure that even sporadic loss of the initial IGMP reports will result in a reasonably fast subscription. The rfc states that the first igmp report should be sent immediately and then mentions that a couple more should be sent but does not specify exactly how the repeating of the igmp reports should occur. The RFC suggests that the behavior in response to an IGMP report (randomized response 0-max response time) could be followed but we have seen issues with this suggestion since the intervals can be very short. There is also no reason to randomize since the unsolicited reports are not a response to an igmp query but the result of a join request in the code. The patch here establishes more fixed delays for sending unsolicited igmp reports after join. There is still a fuzz factor associated but the sending of the igmp reports follows more tightly a set of intervals and sends up to 7 igmp reports. IGMP Report Time delay ------------------------------------------------------------ 0 3 ticks "immediate" according to RFC. 
1 40ms 2 200ms 3 1sec 4 5sec 5 10sec 6 60sec So unsolicited reports are send for an interval of at least a minute (reports are aborted if igmp reports or other info is seen). Signed-off-by: Christoph Lameter <cl-vYTEC60ixJUAvxtiuMwx3w@public.gmane.org> --- net/ipv4/igmp.c | 38 ++++++++++++++++++++++++++++++++++---- 1 file changed, 34 insertions(+), 4 deletions(-) Index: linux-2.6/net/ipv4/igmp.c =================================================================== --- linux-2.6.orig/net/ipv4/igmp.c 2010-09-22 12:50:32.000000000 -0500 +++ linux-2.6/net/ipv4/igmp.c 2010-09-22 13:32:58.000000000 -0500 @@ -124,17 +124,40 @@ /* Control of unsolilcited reports (after join) */ -#define IGMP_Unsolicited_Report_Count 2 +#define IGMP_Unsolicited_Report_Count 6 #define IGMP_Initial_Report_Delay (1) #define IGMP_Unsolicited_Report_Min_Delay (HZ/25) +#define IGMP_Unsolicited_Fuzz (HZ/100) + /* IGMP_Initial_Report_Delay is not from IGMP specs! * IGMP specs require to report membership immediately after * joining a group, but we delay the first report by a * small interval. It seems more natural and still does not * contradict to specs provided this delay is small enough. + * + * The spec does not say how the initial igmp reports + * need to be repeated (aside from suggesting to just do the + * randomization of the intervals as for igmp queries but then + * there is no centralized trigger and therefore no randomization + * needed). We provide an array of delays here that are likely + * to work in general avoiding the often too short or too long intervals + * that would be generated if we would follow the suggestion in the rfc. + * + * Note that the sending of unsolicited reports may stop at any point + * if we see an igmp query from a router or a neighbors ignmp report. 
*/ +static int unsolicited_delay[IGMP_Unsolicited_Report_Count + 1] = { + IGMP_Initial_Report_Delay + IGMP_Mininum_Delay, /* "Immediate" */ + HZ / 25, /* 40ms */ + HZ / 5, /* 200ms */ + HZ, + 5 * HZ, + 10 * HZ, + 60 * HZ +}; + #define IGMP_V1_SEEN(in_dev) \ (IPV4_DEVCONF_ALL(dev_net(in_dev->dev), FORCE_IGMP_VERSION) == 1 || \ IN_DEV_CONF_GET((in_dev), FORCE_IGMP_VERSION) == 1 || \ @@ -199,6 +222,13 @@ static void igmp_start_timer(struct ip_m atomic_inc(&im->refcnt); } +static void igmp_start_initial_timer(struct ip_mc_list *im, int interval) +{ + int delay = unsolicited_delay[interval]; + + igmp_start_timer(im, delay, delay + IGMP_Unsolicited_Fuzz); +} + static void igmp_gq_start_timer(struct in_device *in_dev) { in_dev->mr_gq_running = 1; @@ -748,8 +778,8 @@ static void igmp_timer_expire(unsigned l if (im->unsolicit_count) { im->unsolicit_count--; - igmp_start_timer(im, IGMP_Unsolicited_Report_Min_Delay, - IGMP_Unsolicited_Report_Interval); + igmp_start_initial_timer(im, + IGMP_Unsolicited_Report_Count - im->unsolicit_count); } im->reporter = 1; spin_unlock(&im->lock); @@ -1185,7 +1215,7 @@ static void igmp_group_added(struct ip_m return; if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev)) { spin_lock_bh(&im->lock); - igmp_start_timer(im, IGMP_Mininum_Delay, IGMP_Initial_Report_Delay); + igmp_start_initial_timer(im, 0); spin_unlock_bh(&im->lock); return; } -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org More majordomo info at http://vger.kernel.org/majordomo-info.html ^ permalink raw reply [flat|nested] 26+ messages in thread
[parent not found: <alpine.DEB.2.00.1009221400010.32661-sBS69tsa9Uj/9pzu0YdTqQ@public.gmane.org>]
* Re: igmp: Staggered igmp report intervals for unsolicited igmp reports [not found] ` <alpine.DEB.2.00.1009221400010.32661-sBS69tsa9Uj/9pzu0YdTqQ@public.gmane.org> @ 2010-09-22 19:30 ` David Stevens [not found] ` <OFF06BBC88.0B6755D5-ON882577A6.0068F4F8-882577A6.006B31FB-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org> 0 siblings, 1 reply; 26+ messages in thread From: David Stevens @ 2010-09-22 19:30 UTC (permalink / raw) To: Christoph Lameter Cc: David S. Miller, linux-rdma-u79uwXL29TY76Z2rM5mHXA, netdev-u79uwXL29TY76Z2rM5mHXA, Bob Arendt Christoph Lameter <cl-vYTEC60ixJUAvxtiuMwx3w@public.gmane.org> wrote on 09/22/2010 12:01:28 PM: > The earlier patch added an initial mininum latency and got us up to > ~80ms. However, there are large networks that take longer to configure > multicast paths. I feel your pain, but the protocol allows this to be 0 and all of the unsolicited reports can be lost. I don't think adding a minimum latency solves a general problem. Perhaps the device should queue some packets if it isn't ready quickly? A querier is what makes these reliable, but for the start-up in particular, I think it'd be better to not initiate the send on devices that have this problem until the device is actually ready to send-- why not put the delay in the device driver on initialization? > with this suggestion since the intervals can be very short. There is also > no reason to randomize since the unsolicited reports are not a response to > an igmp query but the result of a join request in the code. These are also staggered to prevent a storm by mass reboots, e.g., from a power outage, and the default groups are joined on interface bring-up. > The patch here establishes more fixed delays for sending unsolicited > igmp reports after join. There is still a fuzz factor associated but the > sending of the igmp reports follows more tightly a set of intervals and sends > up to 7 igmp reports. 
> > IGMP Report Time delay > ------------------------------------------------------------ > 0 3 ticks "immediate" accordig to RFC. > 1 40ms > 2 200ms > 3 1sec > 4 5sec > 5 10sec > 6 60sec > > So unsolicited reports are send for an interval of at least a minute > (reports are aborted if igmp reports or other info is seen). This is outside the protocol spec, and the intervals are neither random nor scaled based on any network performance metric. 1) I'm not sure there's a problem here to solve, other than for your particular hardware. 2) I think this would better be solved in the driver-- don't do the upper initialization and group joins until the sends can actually succeed. 3) I don't think it's a good idea to make up intervals, and especially non-randomized ones. The probability of getting all minimum intervals is very low (which goes back to #1) and sending fixed intervals may introduce a problem (packet storms) that isn't there per RFC. These fixed intervals can also be either way too long or way too short, depending on link characteristics they don't account for. Leaving the intervals randomized based on querier-supplied data seems much more appropriate to me. +-DLS -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org More majordomo info at http://vger.kernel.org/majordomo-info.html ^ permalink raw reply [flat|nested] 26+ messages in thread
[parent not found: <OFF06BBC88.0B6755D5-ON882577A6.0068F4F8-882577A6.006B31FB-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>]
* Re: igmp: Staggered igmp report intervals for unsolicited igmp reports [not found] ` <OFF06BBC88.0B6755D5-ON882577A6.0068F4F8-882577A6.006B31FB-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org> @ 2010-09-22 19:58 ` Christoph Lameter 2010-09-22 20:56 ` Bob Arendt [not found] ` <alpine.DEB.2.00.1009221448460.32661-sBS69tsa9Uj/9pzu0YdTqQ@public.gmane.org> 0 siblings, 2 replies; 26+ messages in thread From: Christoph Lameter @ 2010-09-22 19:58 UTC (permalink / raw) To: David Stevens Cc: David S. Miller, linux-rdma-u79uwXL29TY76Z2rM5mHXA, netdev-u79uwXL29TY76Z2rM5mHXA, Bob Arendt On Wed, 22 Sep 2010, David Stevens wrote: > I feel your pain, but the protocol allows this to be 0 and all > of the unsolicited reports can be lost. I don't think adding a minimum > latency solves a general problem. Perhaps the device should queue some The protocol does not specificy the intervals during unsolicited igmp sends. It only specifies the intervals as a result of a igmp query. > packets if it isn't ready quickly? A querier is what makes these > reliable, but for the start-up in particular, I think it'd be better > to not initiate the send on devices that have this problem until the > device is actually ready to send-- why not put the delay in the device > driver on initialization? The device is ready. Its just the multicast group that has not been established yet. > > an igmp query but the result of a join request in the code. > > These are also staggered to prevent a storm by mass reboots, e.g., > from a power outage, and the default groups are joined on interface > bring-up. There is still some staggering left (see IGMP_Unsolicited_Fuzz). I can increase that if necessary. There also cannot be any storm since any unsolicited igmp report by any system will stop the unsolicited igmp reports by any other system. > > So unsolicited reports are send for an interval of at least a minute > > (reports are aborted if igmp reports or other info is seen). 
> > This is outside the protocol spec, and the intervals are neither > random nor scaled based on any network performance metric. Where does it say that in the spec? Again this is an *unsolicited* igmp report. > 2) I think this would better be solved in the driver-- don't do the > upper initialization and group joins until the sends can actually > succeed. The driver is fine. Its just the multicast path in the network that take time to establish. > 3) I don't think it's a good idea to make up intervals, and especially > non-randomized ones. The probability of getting all minimum > intervals > is very low (which goes back to #1) and sending fixed intervals > may > introduce a problem (packet storms) that isn't there per RFC. > These > fixed intervals can also be either way too long or way too short, > depending on link characteristics they don't account for. Leaving > the intervals randomized based on querier-supplied data seems much > more appropriate to me. These are *unsolicited* igmp reports. There is *no* querier supplied data yet. The first querier supplied data (or any other unsolicited igmp report) will immediately stop the unsolicited reports and then will continue to respond in randomized intervals based on the data that the querier has supplied. -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org More majordomo info at http://vger.kernel.org/majordomo-info.html ^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: igmp: Staggered igmp report intervals for unsolicited igmp reports 2010-09-22 19:58 ` Christoph Lameter @ 2010-09-22 20:56 ` Bob Arendt [not found] ` <4C9A6D87.2000103-x0S3BwdUo6DQT0dZR+AlfA@public.gmane.org> [not found] ` <alpine.DEB.2.00.1009221448460.32661-sBS69tsa9Uj/9pzu0YdTqQ@public.gmane.org> 1 sibling, 1 reply; 26+ messages in thread From: Bob Arendt @ 2010-09-22 20:56 UTC (permalink / raw) To: Christoph Lameter Cc: David Stevens, David S. Miller, linux-rdma@vger.kernel.org, netdev@vger.kernel.org On 09/22/2010 12:58 PM, Christoph Lameter wrote: > On Wed, 22 Sep 2010, David Stevens wrote: >> 3) I don't think it's a good idea to make up intervals, and especially >> non-randomized ones. The probability of getting all minimum >> intervals >> is very low (which goes back to #1) and sending fixed intervals >> may >> introduce a problem (packet storms) that isn't there per RFC. >> These >> fixed intervals can also be either way too long or way too short, >> depending on link characteristics they don't account for. Leaving >> the intervals randomized based on querier-supplied data seems much >> more appropriate to me. > > These are *unsolicited* igmp reports. There is *no* querier supplied data > yet. The first querier supplied data (or any other unsolicited igmp > report) will immediately stop the unsolicited reports and then will > continue to respond in randomized intervals based on the data that the > querier has supplied. > > There certainly seems to be some backing for part of Christoph's concept in the IETF rfc's. I've posted the relevant sections below. IGMPv2 doesn't specify a limit on retransmissions of an unsolicited Join, only that they stop once multicast traffic is received. 
While IGMPv2 defines an "Unsolicited Report Interval" default of 10 seconds, it appears that this is a significant enough issue that the later IGMPv3 document calls out a default of 1 second, and goes on to define a "Robustness Variable" and talks about the same case that Christoph is trying to mitigate. However, both rfc's *do* specify that the random timers should be used based on a value called the "unsolicited report interval". Perhaps implementing the IGMPv3 capability with kernel parameters for an "unsolicited report interval" and "robustness variable" would satisfy Christoph's issue? -Bob Arendt rfc2236 IGMPv2 ============================= Section 3 .... page 4 para 2 When a host joins a multicast group, it should immediately transmit an unsolicited Version 2 Membership Report for that group, in case it is the first member of that group on the network. To cover the possibility of the initial Membership Report being lost or damaged, it is recommended that it be repeated once or twice after short delays [Unsolicited Report Interval]. Section 6 ... page 8 para 4 - "start timer" for the group on the interface, using a delay value chosen uniformly from the interval (0, Max Response Time], where Max Response time is specified in the Query. If this is an unsolicited Report, the timer is set to a delay value chosen uniformly from the interval (0, [Unsolicited Report Interval] ]. 8.10. Unsolicited Report Interval (page 18) The Unsolicited Report Interval is the time between repetitions of a host's initial report of membership in a group. Default: 10 seconds. rfc3376 IGMPv3 ============================ Section 5.1 page 19, near end (note - unsolicited Join is a type of State-Change report) To cover the possibility of the State-Change Report being missed by one or more multicast routers, it is retransmitted [Robustness Variable] - 1 more times, at intervals chosen at random from the range (0, [Unsolicited Report Interval]). 8.11. 
Unsolicited Report Interval (page 41) The Unsolicited Report Interval is the time between repetitions of a host's initial report of membership in a group. Default: 1 second. 8.1. Robustness Variable (page 39) The Robustness Variable allows tuning for the expected packet loss on a network. If a network is expected to be lossy, the Robustness Variable may be increased. IGMP is robust to (Robustness Variable - 1) packet losses. The Robustness Variable MUST NOT be zero, and SHOULD NOT be one. Default: 2 8.14.1. Robustness Variable (page 41/42) The Robustness Variable tunes IGMP to expected losses on a link. IGMPv3 is robust to (Robustness Variable - 1) packet losses, e.g., if the Robustness Variable is set to the default value of 2, IGMPv3 is robust to a single packet loss but may operate imperfectly if more losses occur. On lossy subnetworks, the Robustness Variable should be increased to allow for the expected level of packet loss. However, increasing the Robustness Variable increases the leave latency of the subnetwork. (The leave latency is the time between when the last member stops listening to a source or group and when the traffic stops flowing.) ^ permalink raw reply [flat|nested] 26+ messages in thread
[parent not found: <4C9A6D87.2000103-x0S3BwdUo6DQT0dZR+AlfA@public.gmane.org>]
* Re: igmp: Staggered igmp report intervals for unsolicited igmp reports [not found] ` <4C9A6D87.2000103-x0S3BwdUo6DQT0dZR+AlfA@public.gmane.org> @ 2010-09-22 21:33 ` Christoph Lameter 2010-09-22 21:41 ` David Stevens 0 siblings, 1 reply; 26+ messages in thread From: Christoph Lameter @ 2010-09-22 21:33 UTC (permalink / raw) To: Bob Arendt Cc: David Stevens, David S. Miller, linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org, netdev-u79uwXL29TY76Z2rM5mHXA@public.gmane.org On Wed, 22 Sep 2010, Bob Arendt wrote: > multicast traffic is received. While IGMPv2 defines an "Unsolicited Report > Interval" default of 10 seconds, it appears that this is a significant enough > issue that the later IGMPv3 document calls out a default of 1 second, and > goes on to define a "Robustness Variable" and talks about the same case that > Christoph is trying to mitigate. Actually that suggests a different way to reach the same goal: Subject: igmp: Make unsolicited report interval conform to RFC3376 RFC3376 specifies a shorter time interval for sending igmp joins. This can address issues where joins are slow because the initial join is frequently lost. Also increment the frequency so that we get a 10 reports send over a few seconds. 
Signed-off-by: Christoph Lameter <cl-vYTEC60ixJUAvxtiuMwx3w@public.gmane.org> --- net/ipv4/igmp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) Index: linux-2.6/net/ipv4/igmp.c =================================================================== --- linux-2.6.orig/net/ipv4/igmp.c 2010-09-22 16:28:17.000000000 -0500 +++ linux-2.6/net/ipv4/igmp.c 2010-09-22 16:28:54.000000000 -0500 @@ -114,9 +114,9 @@ #define IGMP_V1_Router_Present_Timeout (400*HZ) #define IGMP_V2_Router_Present_Timeout (400*HZ) -#define IGMP_Unsolicited_Report_Interval (10*HZ) +#define IGMP_Unsolicited_Report_Interval (HZ) #define IGMP_Query_Response_Interval (10*HZ) -#define IGMP_Unsolicited_Report_Count 2 +#define IGMP_Unsolicited_Report_Count 10 #define IGMP_Initial_Report_Delay (1) -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org More majordomo info at http://vger.kernel.org/majordomo-info.html ^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: igmp: Staggered igmp report intervals for unsolicited igmp reports 2010-09-22 21:33 ` Christoph Lameter @ 2010-09-22 21:41 ` David Stevens 2010-09-23 15:37 ` Christoph Lameter 0 siblings, 1 reply; 26+ messages in thread From: David Stevens @ 2010-09-22 21:41 UTC (permalink / raw) To: Christoph Lameter Cc: David S. Miller, linux-rdma@vger.kernel.org, netdev@vger.kernel.org, Bob Arendt Christoph Lameter <cl@linux.com> wrote on 09/22/2010 02:33:14 PM: This can address issues where joins are slow because the initial join is > frequently lost. > > Also increment the frequency so that we get a 10 reports send over a > few seconds. Except you want to conform and not conform at the same time. :-) IGMPv2 should be: default count 2, interval 10secs IGMPv3 should be: default count 2, interval 1sec ...and no way is it a good idea to send 10 unsolicited reports on an Ethernet. I think system-wide defaults must be as suggested (which allows for v3 being shortened to 1sec, but not v2) and if you want to use longer values, you should have either a *per-interface* tunable [ie, the default value for your interface only] or make these per-interface variables and have the IB code bump them up for IB interfaces only. An attached Ethernet on the same system shouldn't be using larger values unless bumped for some reason by an administrator. There is no problem with current values on Ethernet; lets not create one. :-) +-DLS ^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: igmp: Staggered igmp report intervals for unsolicited igmp reports 2010-09-22 21:41 ` David Stevens @ 2010-09-23 15:37 ` Christoph Lameter [not found] ` <alpine.DEB.2.00.1009231021080.32567-sBS69tsa9Uj/9pzu0YdTqQ@public.gmane.org> 0 siblings, 1 reply; 26+ messages in thread From: Christoph Lameter @ 2010-09-23 15:37 UTC (permalink / raw) To: David Stevens Cc: David S. Miller, linux-rdma@vger.kernel.org, netdev@vger.kernel.org, Bob Arendt On Wed, 22 Sep 2010, David Stevens wrote: > > > > Also increment the frequency so that we get a 10 reports send over a > > few seconds. > > Except you want to conform and not conform at the same time. :-) > IGMPv2 should be: default count 2, interval 10secs > IGMPv3 should be: default count 2, interval 1sec This is during the period of unsolicited igmp reports. We do not know if this group is managed using V3 or V2 since no igmp query/report has been received yet. > ...and no way is it a good idea to send 10 unsolicited reports on an > Ethernet. Why would that be an issue? The IGMPv2 RFC has no strict limit and RFC3376 mentions that the retransmission occurs "Robustness Variable" times minus one. Choosing 10 for the "Robustness Variable" is certainly ok. If we do not increase the number of reports but just limit the interval then the chance of outages of a second or so during mc group creation causing routers missing igmp reports is significantly increased. ^ permalink raw reply [flat|nested] 26+ messages in thread
[parent not found: <alpine.DEB.2.00.1009231021080.32567-sBS69tsa9Uj/9pzu0YdTqQ@public.gmane.org>]
* Re: igmp: Staggered igmp report intervals for unsolicited igmp reports [not found] ` <alpine.DEB.2.00.1009231021080.32567-sBS69tsa9Uj/9pzu0YdTqQ@public.gmane.org> @ 2010-09-27 19:24 ` David Stevens 0 siblings, 0 replies; 26+ messages in thread From: David Stevens @ 2010-09-27 19:24 UTC (permalink / raw) To: Christoph Lameter Cc: David S. Miller, linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org, netdev-u79uwXL29TY76Z2rM5mHXA@public.gmane.org, Bob Arendt Christoph Lameter <cl-vYTEC60ixJUAvxtiuMwx3w@public.gmane.org> wrote on 09/23/2010 08:37:48 AM: > > On Wed, 22 Sep 2010, David Stevens wrote: > > > > > > > Also increment the frequency so that we get a 10 reports send over a > > > few seconds. > > > > Except you want to conform and not conform at the same time. :-) > > IGMPv2 should be: default count 2, interval 10secs > > IGMPv3 should be: default count 2, interval 1sec > > This is during the period of unsolicited igmp reports. We do not know if > this group is managed using V3 or V2 since no igmp query/report has been > received yet. The default is IGMPv3 unless a v2 querier is present. You can force it to be IGMPv2 with by having an IGMPv2 querier on the network or by using the force_igmp_version tunable. > > ...and no way is it a good idea to send 10 unsolicited reports on an > > Ethernet. > > Why would that be an issue? Because the traffic for all joins is multiplied by >3. If you're joining 1 group, maybe that wouldn't be an issue, but what if I join 100, and what if hundreds of other hosts on that network do too? And applications that dynamically join and leave groups may do this "normally." Even 3 reports on switched networks with low loss is really unnecessary overkill; 10 is just wasted bandwidth. > The IGMPv2 RFC has no strict limit and RFC3376 > mentions that the retransmission occurs "Robustness Variable" times > minus one. Choosing 10 for the "Robustness Variable" is certainly ok. 
Both of them specify the default value and say a querier is the mechanism for changing that. If you want to follow the RFC, the default is "2", not "10." While it'd be reasonable for a sysadmin to tune this per-interface without a querier, it's not reasonable to make all linux systems on all networks more than triple the number of reports they send from the RFC-specified default. Right?!? :-) > If we do not increase the number of reports but just limit the interval > then the chance of outages of a second or so during mc group creation > causing routers missing igmp reports is significantly increased. If you can't send on a group for 1 second, all of the initial IGMPv3 reports will be lost about half of the time if we make that conformant (it looks like it now uses the 10sec v2 time instead of the 1 sec v3 time it should). That's a problem IB needs to solve. Ideally, you wouldn't want to return from the hardware join until you can actually send the reports, but I expect there are locks held and that can't be 1 second of spinning on a processor. So, I think you really should put a queue in IB for that hardware multicast address and send those packets when/if you get positive acknowledgement (much as done for ARP completion, but maybe queue more than 1) from the fabric that you can use it. If you don't get any sort of ACK for that, then you can instrument a delay for it, but any fixed number you use may be either too big or too small for a particular fabric. +-DLS -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org More majordomo info at http://vger.kernel.org/majordomo-info.html ^ permalink raw reply [flat|nested] 26+ messages in thread
[parent not found: <alpine.DEB.2.00.1009221448460.32661-sBS69tsa9Uj/9pzu0YdTqQ@public.gmane.org>]
* Re: igmp: Staggered igmp report intervals for unsolicited igmp reports [not found] ` <alpine.DEB.2.00.1009221448460.32661-sBS69tsa9Uj/9pzu0YdTqQ@public.gmane.org> @ 2010-09-22 20:36 ` David Stevens 2010-09-22 21:26 ` Christoph Lameter 2010-09-22 21:50 ` Jason Gunthorpe 1 sibling, 1 reply; 26+ messages in thread From: David Stevens @ 2010-09-22 20:36 UTC (permalink / raw) To: Christoph Lameter Cc: David S. Miller, linux-rdma-u79uwXL29TY76Z2rM5mHXA, netdev-u79uwXL29TY76Z2rM5mHXA, Bob Arendt Christoph Lameter <cl-vYTEC60ixJUAvxtiuMwx3w@public.gmane.org> wrote: > > On Wed, 22 Sep 2010, David Stevens wrote: > > > I feel your pain, but the protocol allows this to be 0 and all > > of the unsolicited reports can be lost. I don't think adding a minimum > > latency solves a general problem. Perhaps the device should queue some > > The protocol does not specificy the intervals during unsolicited igmp > sends. It only specifies the intervals as a result of a igmp query. RFC 3376: " To cover the possibility of the State-Change Report being missed by one or more multicast routers, it is retransmitted [Robustness Variable] - 1 more times, at intervals chosen at random from the range (0, [Unsolicited Report Interval])." and "8.11. Unsolicited Report Interval The Unsolicited Report Interval is the time between repetitions of a host's initial report of membership in a group. Default: 1 second." > The device is ready. Its just the multicast group that has not been > established yet. Well, if you know that's going to happen with your device, then again, why not queue them on start up until you have indication that the group has been established, or delay in the driver. You're changing IGMP for all device types to fix a problem in only one. > There also cannot be any storm since any unsolicited igmp report by any > system will stop the unsolicited igmp reports by any other system. Not if they are simultaneous, which is exactly when it is a problem. 
:-) > > > > So unsolicited reports are send for an interval of at least a minute > > > (reports are aborted if igmp reports or other info is seen). > > > > This is outside the protocol spec, and the intervals are neither > > random nor scaled based on any network performance metric. > > Where does it say that in the spec? Again this is an *unsolicited* igmp > report. See quotes above. > These are *unsolicited* igmp reports. There is *no* querier supplied data > yet. The first querier supplied data (or any other unsolicited igmp > report) will immediately stop the unsolicited reports and then will > continue to respond in randomized intervals based on the data that the > querier has supplied. There are initial values, which are currently constants, but it'd be (more) reasonable to turn those into per-interface tunables or per-interface initial values with IB interfaces using larger values. IGMP_Unsolicited_Report_Count (default 2) IGMP_Unsolicited_Report_Interval (default 10secs which is 10x larger,as you want, than the RFC suggests). +-DLS -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org More majordomo info at http://vger.kernel.org/majordomo-info.html ^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: igmp: Staggered igmp report intervals for unsolicited igmp reports 2010-09-22 20:36 ` David Stevens @ 2010-09-22 21:26 ` Christoph Lameter 0 siblings, 0 replies; 26+ messages in thread From: Christoph Lameter @ 2010-09-22 21:26 UTC (permalink / raw) To: David Stevens; +Cc: David S. Miller, linux-rdma, netdev, Bob Arendt On Wed, 22 Sep 2010, David Stevens wrote: > > The protocol does not specificy the intervals during unsolicited igmp > > sends. It only specifies the intervals as a result of a igmp query. > > RFC 3376: > " To cover the possibility of the State-Change Report being missed by > one or more multicast routers, it is retransmitted [Robustness > Variable] - 1 more times, at intervals chosen at random from the > range (0, [Unsolicited Report Interval])." > and > "8.11. Unsolicited Report Interval > > The Unsolicited Report Interval is the time between repetitions of a > host's initial report of membership in a group. Default: 1 second." Hmmm looks like I looked at the earlier RFC 2236 3) (was not really interested in IGMP v3, IGMPv2 is run). When a host joins a multicast group, it should immediately transmit an unsolicited Version 2 Membership Report for that group, in case it is the first member of that group on the network. To cover the possibility of the initial Membership Report being lost or damaged, it is recommended that it be repeated once or twice after short delays [Unsolicited Report Interval]. (A simple way to accomplish this is to send the initial Version 2 Membership Report and then act as if a Group-Specific Query was received for that group, and set a timer appropriately). The new Unsolicited Report Interval is promising. We need to support that? > > The device is ready. Its just the multicast group that has not been > > established yet. > Well, if you know that's going to happen with your device, then > again, why not queue them on start up until you have indication that > the group has been established, or delay in the driver. 
> You're changing IGMP for all device types to fix a problem in > only one. > > > There also cannot be any storm since any unsolicited igmp report by any > > system will stop the unsolicited igmp reports by any other system. > > Not if they are simultaneous, which is exactly when it is a > problem. :-) But then they are not simultaneous since there is a fuzz factor. > > These are *unsolicited* igmp reports. There is *no* querier supplied > data > > yet. The first querier supplied data (or any other unsolicited igmp > > report) will immediately stop the unsolicited reports and then will > > continue to respond in randomized intervals based on the data that the > > querier has supplied. > > There are initial values, which are currently constants, but it'd > be (more) reasonable to turn those into per-interface tunables or > per-interface initial values with IB interfaces using larger values. > > IGMP_Unsolicited_Report_Count (default 2) > IGMP_Unsolicited_Report_Interval (default 10secs which is 10x larger,as > you want, than the RFC suggests). Ahhh... Interesting..... 1 second now? That is much better and would avoid long drawn out joins due to the long delays. ^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: igmp: Staggered igmp report intervals for unsolicited igmp reports [not found] ` <alpine.DEB.2.00.1009221448460.32661-sBS69tsa9Uj/9pzu0YdTqQ@public.gmane.org> 2010-09-22 20:36 ` David Stevens @ 2010-09-22 21:50 ` Jason Gunthorpe 2010-09-23 15:32 ` Christoph Lameter 1 sibling, 1 reply; 26+ messages in thread From: Jason Gunthorpe @ 2010-09-22 21:50 UTC (permalink / raw) To: Christoph Lameter Cc: David Stevens, David S. Miller, linux-rdma-u79uwXL29TY76Z2rM5mHXA, netdev-u79uwXL29TY76Z2rM5mHXA, Bob Arendt On Wed, Sep 22, 2010 at 02:58:15PM -0500, Christoph Lameter wrote: > > packets if it isn't ready quickly? A querier is what makes these > > reliable, but for the start-up in particular, I think it'd be better > > to not initiate the send on devices that have this problem until the > > device is actually ready to send-- why not put the delay in the device > > driver on initialization? > > The device is ready. Its just the multicast group that has not been > established yet. In IB when the SA replies to a group join the group should be ready, prior to that the device can't send into the group because it has no MLID for the group.. If you have a MLID then the group is working. Is the issue you are dropping IGMP packets because the 224.0.0.2 join hasn't finished? Ideally you'd wait for the SA to reply before sending a IGMP, but a simpler solution might just be to use the broadcast MLID for packets addressed to a MGID that has not yet got a MLID. This would be similar to the ethernet behaviour of flooding. Jason -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org More majordomo info at http://vger.kernel.org/majordomo-info.html ^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: igmp: Staggered igmp report intervals for unsolicited igmp reports 2010-09-22 21:50 ` Jason Gunthorpe @ 2010-09-23 15:32 ` Christoph Lameter 2010-09-23 17:26 ` Jason Gunthorpe 0 siblings, 1 reply; 26+ messages in thread From: Christoph Lameter @ 2010-09-23 15:32 UTC (permalink / raw) To: Jason Gunthorpe Cc: David Stevens, David S. Miller, linux-rdma, netdev, Bob Arendt On Wed, 22 Sep 2010, Jason Gunthorpe wrote: > > The device is ready. Its just the multicast group that has not been > > established yet. > > In IB when the SA replies to a group join the group should be ready, > prior to that the device can't send into the group because it has no > MLID for the group.. If you have a MLID then the group is working. When the SA replies it has created the MLID but not reconfigured the fabric yet. So the initial IGMP messages get lost. > Is the issue you are dropping IGMP packets because the 224.0.0.2 join > hasn't finished? Ideally you'd wait for the SA to reply before sending > a IGMP, but a simpler solution might just be to use the broadcast MLID > for packets addressed to a MGID that has not yet got a MLID. This > would bebe similar to the ethernet behaviour of flooding. IGMP reports are sent on the multicast group not on 224.0.0.2. 224.0.0.2 is only used when leaving a multicast group. I thought also about solutions along the same lines. We could modify the IB layer to send to 224.0.0.2 until the SA has confirmed the creation of the MC group. For that to work we first would need to modify the SA logic to ensure that it only sends confirmation *after* the fabric has been reconfigured. Then we need to switch the MLIDs of the MC group when the notification is received. If the IB layer has not joined 224.0.0.2 yet (and it will take awhile) then we could even fall back to broadcast until it's ready. ^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: igmp: Staggered igmp report intervals for unsolicited igmp reports 2010-09-23 15:32 ` Christoph Lameter @ 2010-09-23 17:26 ` Jason Gunthorpe [not found] ` <20100923172603.GM11157-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org> 0 siblings, 1 reply; 26+ messages in thread From: Jason Gunthorpe @ 2010-09-23 17:26 UTC (permalink / raw) To: Christoph Lameter; +Cc: David Stevens, linux-rdma, netdev, Bob Arendt On Thu, Sep 23, 2010 at 10:32:17AM -0500, Christoph Lameter wrote: > > Is the issue you are dropping IGMP packets because the 224.0.0.2 join > > hasn't finished? Ideally you'd wait for the SA to reply before sending > > a IGMP, but a simpler solution might just be to use the broadcast MLID > > for packets addressed to a MGID that has not yet got a MLID. This > > would bebe similar to the ethernet behaviour of flooding. > > IGMP reports are sent on the multicast group not on 224.0.0.2. 224.0.0.2 > is only used when leaving a multicast group. Hm, that is quite different than in IGMPv3.. How does this work at all in IB? A message to the multicast group isn't going to make it to any routers unless the routers use some other means to join the IB MGID. Jason ^ permalink raw reply [flat|nested] 26+ messages in thread
[parent not found: <20100923172603.GM11157-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>]
* Re: igmp: Staggered igmp report intervals for unsolicited igmp reports [not found] ` <20100923172603.GM11157-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org> @ 2010-09-23 17:37 ` Christoph Lameter [not found] ` <alpine.DEB.2.00.1009231230100.2962-sBS69tsa9Uj/9pzu0YdTqQ@public.gmane.org> 0 siblings, 1 reply; 26+ messages in thread From: Christoph Lameter @ 2010-09-23 17:37 UTC (permalink / raw) To: Jason Gunthorpe Cc: David Stevens, linux-rdma-u79uwXL29TY76Z2rM5mHXA, netdev-u79uwXL29TY76Z2rM5mHXA, Bob Arendt On Thu, 23 Sep 2010, Jason Gunthorpe wrote: > On Thu, Sep 23, 2010 at 10:32:17AM -0500, Christoph Lameter wrote: > > > > Is the issue you are dropping IGMP packets because the 224.0.0.2 join > > > hasn't finished? Ideally you'd wait for the SA to reply before sending > > > a IGMP, but a simpler solution might just be to use the broadcast MLID > > > for packets addressed to a MGID that has not yet got a MLID. This > > > would bebe similar to the ethernet behaviour of flooding. > > > > IGMP reports are sent on the multicast group not on 224.0.0.2. 224.0.0.2 > > is only used when leaving a multicast group. > > Hm, that is quite different than in IGMPv3.. How does this work at all > in IB? A message to the multicast group isn't going to make it to any > routers unless the routers use some other means to join the IB MGID. IPoIB creates a infiniband multicast group via the IB calls for a IP multicast group. Then IGMP comes into play and the kernel sends the IP based igmp report. This igmp report must be received by an outside router (on an IP network) in order to for traffic to get forwarded into the IB fabric. You can end up with a IB multicast configuration that is all fine but with loss of the unsolicited packets due to fabric reconfiguration not being complete yet. The larger the fabric the worse the situation. 
If all unsolicited igmp reports are lost then the router will only start forwarding the mc group after the reporting intervals (which could be in the range of minutes) when it triggers igmp reports through a general igmp query. Until that time the MC group looks dead. And people and software may conclude that the **** network is broken. This is a general issue for any network where configuration for MC forwarding is needed and where initial igmp reports may get lost. A staggering of time intervals would be a general solution to that issue. -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org More majordomo info at http://vger.kernel.org/majordomo-info.html ^ permalink raw reply [flat|nested] 26+ messages in thread
[parent not found: <alpine.DEB.2.00.1009231230100.2962-sBS69tsa9Uj/9pzu0YdTqQ@public.gmane.org>]
* Re: igmp: Staggered igmp report intervals for unsolicited igmp reports [not found] ` <alpine.DEB.2.00.1009231230100.2962-sBS69tsa9Uj/9pzu0YdTqQ@public.gmane.org> @ 2010-09-23 17:46 ` Jason Gunthorpe [not found] ` <20100923174614.GN11157-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org> 2010-09-27 19:32 ` David Stevens 1 sibling, 1 reply; 26+ messages in thread From: Jason Gunthorpe @ 2010-09-23 17:46 UTC (permalink / raw) To: Christoph Lameter Cc: David Stevens, linux-rdma-u79uwXL29TY76Z2rM5mHXA, netdev-u79uwXL29TY76Z2rM5mHXA, Bob Arendt On Thu, Sep 23, 2010 at 12:37:28PM -0500, Christoph Lameter wrote: > On Thu, 23 Sep 2010, Jason Gunthorpe wrote: > > > On Thu, Sep 23, 2010 at 10:32:17AM -0500, Christoph Lameter wrote: > > > > > > Is the issue you are dropping IGMP packets because the 224.0.0.2 join > > > > hasn't finished? Ideally you'd wait for the SA to reply before sending > > > > a IGMP, but a simpler solution might just be to use the broadcast MLID > > > > for packets addressed to a MGID that has not yet got a MLID. This > > > > would bebe similar to the ethernet behaviour of flooding. > > > > > > IGMP reports are sent on the multicast group not on 224.0.0.2. 224.0.0.2 > > > is only used when leaving a multicast group. > > > > Hm, that is quite different than in IGMPv3.. How does this work at all > > in IB? A message to the multicast group isn't going to make it to any > > routers unless the routers use some other means to join the IB MGID. > > IPoIB creates a infiniband multicast group via the IB calls for a IP > multicast group. Then IGMP comes into play and the kernel sends the IP > based igmp report. This igmp report must be received by an outside router > (on an IP network) in order to for traffic to get forwarded into the IB > fabric. You can end up with a IB multicast configuration that is all fine > but with loss of the unsolicited packets due to fabric reconfiguration not > being complete yet. The larger the fabric the worse the situation. 
But my point is that IB has very limited multicast, if I create a IB group and then send IGMP into that group *it will not reach a router*. I have to send something to the all routers group or the all IGMPv3 group to get it to reach a router with any reliably. The only way this kind of scheme could work is if an IGMPv2 IPoIB router listens for IB MGID Create notices from the SA and automatically joins all groups that are created, so it can get IGMPv2 membership reports. Which obviously adds more delay, lag, and risk. I'm *guessing* that the change in IGMPv3 to send reports to 224.0.0.22 (all IGMPv3 multicast address) is related to this sort of problem, and it seems like on IB IGMPv2 is not a good fit and should not be used if v3 is available.. Jason -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org More majordomo info at http://vger.kernel.org/majordomo-info.html ^ permalink raw reply [flat|nested] 26+ messages in thread
[parent not found: <20100923174614.GN11157-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>]
* Re: igmp: Staggered igmp report intervals for unsolicited igmp reports [not found] ` <20100923174614.GN11157-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org> @ 2010-09-23 17:56 ` Christoph Lameter 0 siblings, 0 replies; 26+ messages in thread From: Christoph Lameter @ 2010-09-23 17:56 UTC (permalink / raw) To: Jason Gunthorpe Cc: David Stevens, linux-rdma-u79uwXL29TY76Z2rM5mHXA, netdev-u79uwXL29TY76Z2rM5mHXA, Bob Arendt On Thu, 23 Sep 2010, Jason Gunthorpe wrote: > > IPoIB creates a infiniband multicast group via the IB calls for a IP > > multicast group. Then IGMP comes into play and the kernel sends the IP > > based igmp report. This igmp report must be received by an outside router > > (on an IP network) in order to for traffic to get forwarded into the IB > > fabric. You can end up with a IB multicast configuration that is all fine > > but with loss of the unsolicited packets due to fabric reconfiguration not > > being complete yet. The larger the fabric the worse the situation. > > But my point is that IB has very limited multicast, if I create a IB > group and then send IGMP into that group *it will not reach a router*. The IPoIB routers automatically join all IP MC groups created. > The only way this kind of scheme could work is if an IGMPv2 IPoIB > router listens for IB MGID Create notices from the SA and > automatically joins all groups that are created, so it can get IGMPv2 > membership reports. Which obviously adds more delay, lag, and risk. Right that is how it works now. > I'm *guessing* that the change in IGMPv3 to send reports to 224.0.0.22 > (all IGMPv3 multicast address) is related to this sort of problem, and > it seems like on IB IGMPv2 is not a good fit and should not be used if > v3 is available.. Existing routers do not support IGMPv3.
-- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org More majordomo info at http://vger.kernel.org/majordomo-info.html ^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: igmp: Staggered igmp report intervals for unsolicited igmp reports [not found] ` <alpine.DEB.2.00.1009231230100.2962-sBS69tsa9Uj/9pzu0YdTqQ@public.gmane.org> 2010-09-23 17:46 ` Jason Gunthorpe @ 2010-09-27 19:32 ` David Stevens 1 sibling, 0 replies; 26+ messages in thread From: David Stevens @ 2010-09-27 19:32 UTC (permalink / raw) To: Christoph Lameter Cc: Jason Gunthorpe, linux-rdma-u79uwXL29TY76Z2rM5mHXA, netdev-u79uwXL29TY76Z2rM5mHXA, Bob Arendt Christoph Lameter <cl-vYTEC60ixJUAvxtiuMwx3w@public.gmane.org> wrote on 09/23/2010 10:37:28 AM: > > If all unsolicited igmp reports are lost then the router will > only start forwarding the mc group after the reporting intervals > (which could be in the range of minutes) when it triggers igmp reports > through a general igmp query. Until that time the MC group looks dead. And > people and software may conclude that the **** network is broken. You can, of course, add a querier (or configure it, assuming an attached switch supports it) and set the query interval and robustness count as appropriate for that network. > This is a general issue for any network where configurations for MC > forwarding is needed and where initial igmp reports may get lost. Meaning "IB-only", right? :-) Maybe other NBMA networks too, but certainly not a typical problem for typical networks (i.e., Ethernet). > A staggering of time intervals would be a general solution to that issue. As would be having those networks queue packets for hardware addresses they know require a delay before a transmit can complete. But that approach can't adversely affect already-working solutions for typical networks, or depart unnecessarily from established standard protocols. +-DLS -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org More majordomo info at http://vger.kernel.org/majordomo-info.html ^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: igmp: Allow mininum interval specification for igmp timers. [not found] ` <alpine.DEB.2.00.1009221354410.32661-sBS69tsa9Uj/9pzu0YdTqQ@public.gmane.org> 2010-09-22 19:01 ` igmp: Staggered igmp report intervals for unsolicited igmp reports Christoph Lameter @ 2010-09-24 4:38 ` David Miller [not found] ` <20100923.213823.137834706.davem-fT/PcQaiUtIeIZ0/mPfg9Q@public.gmane.org> 1 sibling, 1 reply; 26+ messages in thread From: David Miller @ 2010-09-24 4:38 UTC (permalink / raw) To: cl-vYTEC60ixJUAvxtiuMwx3w Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA, netdev-u79uwXL29TY76Z2rM5mHXA, rda-x0S3BwdUo6DQT0dZR+AlfA, dlstevens-r/Jw6+rmf7HQT0dZR+AlfA From: Christoph Lameter <cl-vYTEC60ixJUAvxtiuMwx3w@public.gmane.org> Date: Wed, 22 Sep 2010 13:59:30 -0500 (CDT) > IGMP timers sometimes fire too rapidly due to randomization of the > intervalsfrom 0 to max_delay in igmp_start_timer(). ... > Signed-off-by: Christoph Lameter <cl-vYTEC60ixJUAvxtiuMwx3w@public.gmane.org> This change seems reasonable to me, what do you think David? -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org More majordomo info at http://vger.kernel.org/majordomo-info.html ^ permalink raw reply [flat|nested] 26+ messages in thread
[parent not found: <20100923.213823.137834706.davem-fT/PcQaiUtIeIZ0/mPfg9Q@public.gmane.org>]
* Re: igmp: Allow mininum interval specification for igmp timers. [not found] ` <20100923.213823.137834706.davem-fT/PcQaiUtIeIZ0/mPfg9Q@public.gmane.org> @ 2010-09-27 17:41 ` David Stevens [not found] ` <OF4BA8F9C2.467056E9-ON882577AB.005F4832-882577AB.00612DD6-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org> 0 siblings, 1 reply; 26+ messages in thread From: David Stevens @ 2010-09-27 17:41 UTC (permalink / raw) To: David Miller Cc: cl-vYTEC60ixJUAvxtiuMwx3w, linux-rdma-u79uwXL29TY76Z2rM5mHXA, netdev-u79uwXL29TY76Z2rM5mHXA, netdev-owner-u79uwXL29TY76Z2rM5mHXA, rda-x0S3BwdUo6DQT0dZR+AlfA David Miller <davem-fT/PcQaiUtIeIZ0/mPfg9Q@public.gmane.org> wrote on 09/23/2010 09:38:23 PM: > > > IGMP timers sometimes fire too rapidly due to randomization of the > > intervalsfrom 0 to max_delay in igmp_start_timer(). > ... > > Signed-off-by: Christoph Lameter <cl-vYTEC60ixJUAvxtiuMwx3w@public.gmane.org> > > > This change seems reasonable to me, what do you think David? [sorry for the delay -- I was off-line for the last few days] Dave, I don't know if you saw the more extended discussion we had on this or not, but I think while this would help for IB, it's not appropriate in general. These can in fact be "0" per RFC which is worst case for IB if there is a delay for being able to use the group, and the newer IGMPv3 standard has shortened the max interval from 10sec in v2 to 1 sec. Fundamentally, the problem is that the device needs to be able to send on the group immediately for IGMP; that it can't for IB is the problem, and I think it should be solved in IB by either queueing packets there or delaying there as needed before doing the joins. I don't think tweaking IGMP for this is appropriate at all, but if done there, it ought to be per-interface so it doesn't change anything for other network types which don't have this problem. 
It should be randomized and not the fixed delays to prevent storms on a mass start-up, and we also don't want to be increasing the number of duplicates for other network types. The default should be 2 reports in randomized 0-10 sec for each for v2, 2 in randomized 0-1 sec for v3. +-DLS -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org More majordomo info at http://vger.kernel.org/majordomo-info.html ^ permalink raw reply [flat|nested] 26+ messages in thread
[parent not found: <OF4BA8F9C2.467056E9-ON882577AB.005F4832-882577AB.00612DD6-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>]
* Re: igmp: Allow mininum interval specification for igmp timers. [not found] ` <OF4BA8F9C2.467056E9-ON882577AB.005F4832-882577AB.00612DD6-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org> @ 2010-09-27 17:54 ` David Miller [not found] ` <20100927.105444.214208865.davem-fT/PcQaiUtIeIZ0/mPfg9Q@public.gmane.org> 0 siblings, 1 reply; 26+ messages in thread From: David Miller @ 2010-09-27 17:54 UTC (permalink / raw) To: dlstevens-r/Jw6+rmf7HQT0dZR+AlfA Cc: cl-vYTEC60ixJUAvxtiuMwx3w, linux-rdma-u79uwXL29TY76Z2rM5mHXA, netdev-u79uwXL29TY76Z2rM5mHXA, netdev-owner-u79uwXL29TY76Z2rM5mHXA, rda-x0S3BwdUo6DQT0dZR+AlfA From: David Stevens <dlstevens-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org> Date: Mon, 27 Sep 2010 10:41:20 -0700 > I don't know if you saw the more extended discussion we > had on this or not, but I think while this would help for IB, > it's not appropriate in general. These can in fact be "0" per > RFC which is worst case for IB if there is a delay for being > able to use the group, and the newer IGMPv3 standard has shortened > the max interval from 10sec in v2 to 1 sec. I did see the extended discussion, and it was interesting :-) But that mainly focused on the second patch, which I appropriately marked as needing changes in patchwork. This patch on the other hand is attacking a different problem, namely avoiding the worst cases caused by the randomization we do for the timer. With bad luck this thing times out way too fast because the total of all of the randomized intervals can end up being very small, and I think we should fix that independently of the other issues hit by the IB folks. Don't you agree? -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org More majordomo info at http://vger.kernel.org/majordomo-info.html ^ permalink raw reply [flat|nested] 26+ messages in thread
[parent not found: <20100927.105444.214208865.davem-fT/PcQaiUtIeIZ0/mPfg9Q@public.gmane.org>]
* Re: igmp: Allow mininum interval specification for igmp timers. [not found] ` <20100927.105444.214208865.davem-fT/PcQaiUtIeIZ0/mPfg9Q@public.gmane.org> @ 2010-09-27 18:16 ` David Stevens 2010-09-27 19:55 ` David Stevens 1 sibling, 0 replies; 26+ messages in thread From: David Stevens @ 2010-09-27 18:16 UTC (permalink / raw) To: David Miller Cc: cl-vYTEC60ixJUAvxtiuMwx3w, linux-rdma-u79uwXL29TY76Z2rM5mHXA, netdev-u79uwXL29TY76Z2rM5mHXA, netdev-owner-u79uwXL29TY76Z2rM5mHXA, rda-x0S3BwdUo6DQT0dZR+AlfA netdev-owner-u79uwXL29TY76Z2rM5mHXA@public.gmane.org wrote on 09/27/2010 10:54:44 AM: > From: David Miller <davem-fT/PcQaiUtIeIZ0/mPfg9Q@public.gmane.org> > To: David Stevens/Beaverton/IBM@IBMUS > Cc: cl-vYTEC60ixJUAvxtiuMwx3w@public.gmane.org, linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org, > netdev-u79uwXL29TY76Z2rM5mHXA@public.gmane.org, netdev-owner-u79uwXL29TY76Z2rM5mHXA@public.gmane.org, rda-x0S3BwdUo6DQT0dZR+AlfA@public.gmane.org > Date: 09/27/2010 10:54 AM > Subject: Re: igmp: Allow mininum interval specification for igmp timers. > Sent by: netdev-owner-u79uwXL29TY76Z2rM5mHXA@public.gmane.org > > From: David Stevens <dlstevens-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org> > Date: Mon, 27 Sep 2010 10:41:20 -0700 > > > I don't know if you saw the more extended discussion we > > had on this or not, but I think while this would help for IB, > > it's not appropriate in general. These can in fact be "0" per > > RFC which is worst case for IB if there is a delay for being > > able to use the group, and the newer IGMPv3 standard has shortened > > the max interval from 10sec in v2 to 1 sec. > > I did see the extended discussion, and it was interesting :-) > > But that mainly focused on the second patch, which I appropriately > marked as needing changes in patchwork. OK, I'm not sure I've seen them all; haven't caught up on e-mail yet. 
> This patch on the other hand is attacking a different problem, > namely avoiding the worst cases caused by the randomization we > do for the timer. I think the multiples are to allow for drops and the randomization is to prevent storms. As far as IGMP is concerned, it's perfectly fine to send them back-to-back, since drops are not necessarily time periods of network outage (as with IB) but rather transient queue overflows where even the short delay of a "0" timer but still having protocol and packet transmit delay would be fine. > With bad luck this thing times out way too fast because the total of > all of the randomized intervals can end up being very small, and I > think we should fix that independently of the other issues hit by the > IB folks. > > Don't you agree? If you mean enforcing a minimum spacing higher than a "0" timer, I don't know that it's an issue for other network types. According to IGMPv3, all of them (3 total) on average would be sent in 1 sec, but it also isn't fatal to drop all of them. To the extent that 1 sec is "small," it is intentional. I'll try digging out the particular patch and comment. I'm not sure many of these tweaks would necessarily hurt other network types but I think the current code also isn't a problem for anything but IB, and that issue can be fixed within IB. +-DLS -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org More majordomo info at http://vger.kernel.org/majordomo-info.html ^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: igmp: Allow mininum interval specification for igmp timers. [not found] ` <20100927.105444.214208865.davem-fT/PcQaiUtIeIZ0/mPfg9Q@public.gmane.org> 2010-09-27 18:16 ` David Stevens @ 2010-09-27 19:55 ` David Stevens 2010-09-27 20:20 ` Christoph Lameter 1 sibling, 1 reply; 26+ messages in thread From: David Stevens @ 2010-09-27 19:55 UTC (permalink / raw) To: David Miller Cc: cl-vYTEC60ixJUAvxtiuMwx3w, linux-rdma-u79uwXL29TY76Z2rM5mHXA, netdev-u79uwXL29TY76Z2rM5mHXA, netdev-owner-u79uwXL29TY76Z2rM5mHXA, rda-x0S3BwdUo6DQT0dZR+AlfA David Miller <davem-fT/PcQaiUtIeIZ0/mPfg9Q@public.gmane.org> wrote on 09/27/2010 10:54:44 AM: > This patch on the other hand is attacking a different problem, > namely avoiding the worst cases caused by the randomization we > do for the timer. > > With bad luck this thing times out way too fast because the total of > all of the randomized intervals can end up being very small, and I > think we should fix that independently of the other issues hit by the > IB folks. I think I'm caught up on the discussion now. For IGMPv3, we would send all the reports always in < 2 secs, and the average would be < 1 sec, so I'm not sure any sort of tweaks we do to enforce a minimum randomized interval are compatible with IGMPv3 and still solve IB's problem. As I said before, I think per protocol, back-to-back is both allowed and not a problem, even if both subsequent randomized reports come out to 0 time. But if we wanted to enforce a minimum interval of, say, X, then I think the better way to do that is to set the timer to X + rand(Interval-X) and not a table of fixed intervals as in the original patch. For v2, X=1 or 2 sec and Interval=10 might work well, but for v3, the entire interval is 1 sec and I think I saw that the set-up time for the fabric may be on the order of 1 sec. I also don't think that we want those kinds of delays on Ethernet. 
A program may join and send lots of traffic in 1 sec, and if the immediate join is lost, one of the quickly-following <1 sec duplicate reports will make it recover and work. Delaying the minimum would guarantee it wouldn't work until that minimum and drop all that traffic if the immediate report is lost, then. Really, of course, I think the solution belongs in IB, but if we did anything in IGMP, I'd prefer it were a per-interface tunable that defaults as in the RFC. Since you can change the interval and # of reports through a querier now, exporting the default values of (10,2) for v2 and (1,2) for v3 to instead be per-interface tunables and then bumped as needed for IB would allow tweaking without running a querier. But a querier that's using default values would also override that and cause the problem all over again. Queuing in the driver until the MAC address is usable solves it generally. Also, MLD and IPv6 will have all these same issues, and working multicasting is even more important there. +-DLS -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org More majordomo info at http://vger.kernel.org/majordomo-info.html ^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: igmp: Allow mininum interval specification for igmp timers. 2010-09-27 19:55 ` David Stevens @ 2010-09-27 20:20 ` Christoph Lameter [not found] ` <alpine.DEB.2.00.1009271503420.14117-sBS69tsa9Uj/9pzu0YdTqQ@public.gmane.org> 0 siblings, 1 reply; 26+ messages in thread From: Christoph Lameter @ 2010-09-27 20:20 UTC (permalink / raw) To: David Stevens; +Cc: David Miller, linux-rdma, netdev, netdev-owner, rda On Mon, 27 Sep 2010, David Stevens wrote: > > With bad luck this thing times out way too fast because the total of > > all of the randomized intervals can end up being very small, and I > > think we should fix that independently of the other issues hit by the > > IB folks. > > I think I'm caught up on the discussion now. For IGMPv3, we > would send all the reports always in < 2 secs, and the average would > be < 1 sec, so I'm not sure any sort of tweaks we do to enforce a > minimum randomized interval are compatible with IGMPv3 and still > solve IB's problem. Ok thanks for the effort but so far I do not see you having caught up. I'd rather avoid responding to the misleading statements you made in other replies and just respond to where you missed the boat here. > As I said before, I think per protocol, back-to-back is both > allowed and not a problem, even if both subsequent randomized reports > come out to 0 time. But if we wanted to enforce a minimum interval > of, say, X, then I think the better way to do that is to set the > timer to X + rand(Interval-X) and not a table of fixed intervals The second patch sets the intervals to X .. X + Rand (interval) and not to a table of fixed intervals as you state here. I have pointed this out before. > as in the original patch. For v2, X=1 or 2 sec and Interval=10 > might work well, but for v3, the entire interval is 1 sec and I > think I saw that the set-up time for the fabric may be on the > order of 1 sec. 
Again there is no knowledge about V2 or V3 without a query and this is during the period when no querier is known yet. You stated elsewhere that I can assume V3 by default? So 1 sec? > I also don't think that we want those kinds of delays on > Ethernet. A program may join and send lots of traffic in 1 sec, > and if the immediate join is lost, one of the quickly-following > <1 sec duplicate reports will make it recover and work. Delaying > the minimum would guarantee it wouldn't work until that minimum > and drop all that traffic if the immediate report is lost, then. There can be any number of reasons that a short outage could prevent the packets from going through. A buffer overrun (that you mentioned elsewhere) usually causes lots of packets to be lost. Buffer overrun scenarios usually mean that all igmp queries are lost. > Really, of course, I think the solution belongs in IB, but > if we did anything in IGMP, I'd prefer it were a per-interface > tunable that defaults as in the RFC. Since you can change the > interval and # of reports through a querier now, exporting the > default values of (10,2) for v2 and (1,2) for v3 to instead be > per-interface tunables and then bumped as needed for IB would > allow tweaking without running a querier. But a querier that's > using default values would also override that and cause the > problem all over again. Queuing in the driver until the MAC > address is usable solves it generally. There is no solution on the IB layer since there is no notification when the fabric reconfiguration necessary for a multicast group is complete. The querier is of no use since (for the gazillionth time) this is an unsolicited IGMP report. If there is a querier then the unsolicited igmp reports would not be used but the timeout indicated by the querier would be used. ^ permalink raw reply [flat|nested] 26+ messages in thread
[parent not found: <alpine.DEB.2.00.1009271503420.14117-sBS69tsa9Uj/9pzu0YdTqQ@public.gmane.org>]
* Re: igmp: Allow mininum interval specification for igmp timers. [not found] ` <alpine.DEB.2.00.1009271503420.14117-sBS69tsa9Uj/9pzu0YdTqQ@public.gmane.org> @ 2010-09-27 21:45 ` David Stevens [not found] ` <OF4DDFA464.A933254C-ON882577AB.00754000-882577AB.00778C77-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org> 0 siblings, 1 reply; 26+ messages in thread From: David Stevens @ 2010-09-27 21:45 UTC (permalink / raw) To: Christoph Lameter Cc: David Miller, linux-rdma-u79uwXL29TY76Z2rM5mHXA, netdev-u79uwXL29TY76Z2rM5mHXA, netdev-owner-u79uwXL29TY76Z2rM5mHXA, rda-x0S3BwdUo6DQT0dZR+AlfA Christoph Lameter <cl-vYTEC60ixJUAvxtiuMwx3w@public.gmane.org> wrote on 09/27/2010 01:20:54 PM: > > > As I said before, I think per protocol, back-to-back is both > > allowed and not a problem, even if both subsequent randomized reports > > come out to 0 time. But if we wanted to enforce a minimum interval > > of, say, X, then I think the better way to do that is to set the > > timer to X + rand(Interval-X) and not a table of fixed intervals > > The second patch sets the intervals to X .. X + Rand (interval) and not to > a table of fixed intervals as you state here. I have pointed this out > before. Sorry if I've misunderstood something you're proposing, but what you describe above would be certainly technically incorrect. There are really no circumstances for sending a report greater than <Interval> that is protocol-compliant. You can enforce a minimum greater than 0, which is a departure from both RFCs, though IGMPv2 uses wishy-washy language. The intent for both was to explicitly allow 0, IMO. > > > as in the original patch. For v2, X=1 or 2 sec and Interval=10 > > might work well, but for v3, the entire interval is 1 sec and I > > think I saw that the set-up time for the fabric may be on the > > order of 1 sec. > > Again there is no knowledge about V2 or V3 without a query and this is > during the period when no querier is known yet. You stated elsewhere that > I can assume V3 by default? 
So 1 sec? Yes, without a querier or the tunable to force it to IGMPv2, the default is IGMPv3. It appears there is a bug where IGMPv3 is also using a 10sec interval (haven't verified that), but a 1 sec interval as required makes your situation worse, not better. It makes it even more likely that all the initial reports will occur before your set-up is done. > There can be any number of reasons that a short outage could prevent the > packets from going through. A buffer overrun (that you mentioned > elsewhere) usually causes lots of packets to be lost. Buffer overrun > scenarios usually mean that all igmp queries are lost. You're arguing against protocol compliance. I didn't define the protocol, I only implemented it. And your view is through the IB lens, but I don't believe this is an actual problem in any way for typical networks. If you wrote a standards-track RFC that modifies IGMP for NBMA networks that require a delay or different parameters there, I'd have no objection to implementing that. Unilaterally changing linux's behavior on all network types without cause for departing from RFC on the most common types is another matter. > There is no solution on the IB layer since there is no notification when > the fabric reconfiguration necessary for an multicast group is complete. Certainly that's not true; without notification, you can queue for first use of a new hardware multicast address and send the queue after an appropriate delay (1 sec? If that covers your set-up time). If you had positive acknowledgement from the IB network, you'd know exactly when to do it, but there's no need to change anything for non-IB networks here. > The querier is of not use since (for the gazillionth of times) this is an > unsolicited IGMP report. If there is a querier then the unsolicited igmp > reports would not be used but the timeout indicated by the querier would > be used. A querier affects unsolicited reports because it sets both the query interval and the robustness value. 
If you want to send 10 reports, you can cause that by having a querier that sets it to that many. The initial join would then send 10 reports and the query interval can also be as low as you like. But the linux code is not just for your particular problem or particular configuration. You can solve your problem by adding a querier, but I know you're trying to do it without. The mail I was responding to referred also to the case of a querier present, which is actually the "normal" case for using full IGMP is. I'm saying that for the non-querier case, making those per-interface configurable is reasonable because they *are* querier-changeable, but you can also use a querier to change it _for_the_unsolicited_reports_, as well as making the querier interval small enough that you don't have to care at all whether any or all of the unsolicited reports are lost. +-DLS -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org More majordomo info at http://vger.kernel.org/majordomo-info.html ^ permalink raw reply [flat|nested] 26+ messages in thread
[parent not found: <OF4DDFA464.A933254C-ON882577AB.00754000-882577AB.00778C77-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>]
* Re: igmp: Allow mininum interval specification for igmp timers. [not found] ` <OF4DDFA464.A933254C-ON882577AB.00754000-882577AB.00778C77-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org> @ 2010-09-28 18:42 ` Christoph Lameter 0 siblings, 0 replies; 26+ messages in thread From: Christoph Lameter @ 2010-09-28 18:42 UTC (permalink / raw) To: David Stevens Cc: David Miller, linux-rdma-u79uwXL29TY76Z2rM5mHXA, netdev-u79uwXL29TY76Z2rM5mHXA, netdev-owner-u79uwXL29TY76Z2rM5mHXA, rda-x0S3BwdUo6DQT0dZR+AlfA On Mon, 27 Sep 2010, David Stevens wrote: > > The second patch sets the intervals to X .. X + Rand (interval) and not > to > > a table of fixed intervals as you state here. I have pointed this out > > before. > > Sorry if I've misunderstood something you're proposing, but what > you describe above would be certainly technically incorrect. There are > really no circumstances for sending a report greater than <Interval> > that is protocol-compliant. You can enforce a minimum greater than 0, > which is a departure from both RFCs, though IGMPv2 uses wishy-washy > language. The intent for both was to explicitly allow 0, IMO. There is no igmp interval set by any igmp query yet so this is your usual unresponsive crappy response to something else that we are not talking about. I thought you were talking about the "fixed intervals" that you saw in the patch. These initial intervals are for unsolicited igmp reports (do I need to add that statement to every sentence in a thread where we *only* discuss unsolicited igmp issues?) and those "intervals" are randomized and not fixed. > > > as in the original patch. For v2, X=1 or 2 sec and Interval=10 > > > might work well, but for v3, the entire interval is 1 sec and I > > > think I saw that the set-up time for the fabric may be on the > > > order of 1 sec. > > > > Again there is no knowledge about V2 or V3 without a query and this is > > during the period when no querier is known yet. You stated elsewhere > that > > I can assume V3 by default? 
So 1 sec? > > Yes, without a querier or the tunable to force it to IGMPv2, > the default is IGMPv3. It appears there is a bug where IGMPv3 is also > using a 10sec interval (haven't verified that), but a 1 sec interval You do not have the linux source code tree available? from net/ipv4/igmp.c: #define IGMP_Unsolicited_Report_Interval (10*HZ) > as required makes your situation worse, not better. It makes it even > more likely that all the initial reports will occur before your set-up > is done. Right. So can you please give me an approach that considers all these issues and does not invent problems that do not exist, stays within the subject discussed and follows the RFCs? > > There can be any number of reasons that a short outage could prevent the > > packets from going through. A buffer overrun (that you mentioned > > elsewhere) usually causes lots of packets to be lost. Buffer overrun > > scenarios usually mean that all igmp queries are lost. > > You're arguing against protocol compliance. I didn't define > the protocol, I only implemented it. And your view is through the > IB lens, but I don't believe this is an actual problem in any way > for typical networks. If you wrote a standards-track RFC that modifies > IGMP for NBMA networks that require a delay or different parameters > there, I'd have no objection to implementing that. Unilaterally > changing linux's behavior on all network types without cause for > departing from RFC on the most common types is another matter. The RFCs state that the igmp queries have to be repeated at least 3 times. The first patch ensures that a minimum time passes between two igmp reports (to avoid them getting lost in one go). The second patch doubles the number of igmp reports and increases the intervals so that we still have a chance to process the join before the next igmp query is sent by the router (which can be minutes away). It fixes buggy behavior that we see because multicast joins take very long or fail outright. 
> > There is no solution on the IB layer since there is no notification when > > the fabric reconfiguration necessary for an multicast group is complete. > > Certainly that's not true; without notification, you can queue for > first use of a new hardware multicast address and send the queue after an > appropriate delay (1 sec? If that covers your set-up time). If you had > positive acknowledgement from the IB network, you'd know exactly when to > do it, but there's no need to change anything for non-IB networks here. So you want an arbitrary delay for all new multicast traffic to be created? I'd rather have a series of staggered attempts so that we can avoid this setup time. Also the setup time varies greatly depending on the complexity of the fabric changes. It can be extremely fast if the multicast group is already in use by others in the fabric. Adding a delay penalizes everyone unnecessarily. Also much of these seems to be contigent on IGMPv3. We are using igmpv2. > > The querier is of not use since (for the gazillionth of times) this is > an > > unsolicited IGMP report. If there is a querier then the unsolicited igmp > > reports would not be used but the timeout indicated by the querier would > > be used. > > A querier affects unsolicited reports because it sets both the > query interval and the robustness value. If you want to send 10 reports, > you can cause that by having a querier that sets it to that many. The > initial join would then send 10 reports and the query interval can also > be as low as you like. The Linux IGMP subsystem does not support either of those at this point. When the multicast group is created it has no notion of query intervals until the first igmp query is received. That is the period of interest that we are discussing! > But the linux code is not just for your particular problem or > particular configuration. You can solve your problem by adding a querier, > but I know you're trying to do it without. 
The mail I was responding to > referred also to the case of a querier present, which is actually the > "normal" case for using full IGMP is. I'm saying that for the non-querier > case, making those per-interface configurable is reasonable because > they *are* querier-changeable, but you can also use a querier to change > it _for_the_unsolicited_reports_, as well as making the querier interval > small enough that you don't have to care at all whether any or all of > the unsolicited reports are lost. The network of course has a querier that sends igmp requests in intervals that could span minutes. We are talking about the period of time after a join when a multicast group has been created but we have not reached the time when the router sends an igmp query and where the various bits of information about the igmp handling can be determined for the multicast group. I am not sure that you comprehend the basics of IGMP processing in the kernel. I see knowledge about IGMP in general but you have a difficult time relating that to what the Linux kernel actually does. Would you please have a look at the source code? -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org More majordomo info at http://vger.kernel.org/majordomo-info.html ^ permalink raw reply [flat|nested] 26+ messages in thread
end of thread, other threads:[~2010-09-28 18:42 UTC | newest]
Thread overview: 26+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2010-09-22 18:59 igmp: Allow mininum interval specification for igmp timers Christoph Lameter
[not found] ` <alpine.DEB.2.00.1009221354410.32661-sBS69tsa9Uj/9pzu0YdTqQ@public.gmane.org>
2010-09-22 19:01 ` igmp: Staggered igmp report intervals for unsolicited igmp reports Christoph Lameter
[not found] ` <alpine.DEB.2.00.1009221400010.32661-sBS69tsa9Uj/9pzu0YdTqQ@public.gmane.org>
2010-09-22 19:30 ` David Stevens
[not found] ` <OFF06BBC88.0B6755D5-ON882577A6.0068F4F8-882577A6.006B31FB-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2010-09-22 19:58 ` Christoph Lameter
2010-09-22 20:56 ` Bob Arendt
[not found] ` <4C9A6D87.2000103-x0S3BwdUo6DQT0dZR+AlfA@public.gmane.org>
2010-09-22 21:33 ` Christoph Lameter
2010-09-22 21:41 ` David Stevens
2010-09-23 15:37 ` Christoph Lameter
[not found] ` <alpine.DEB.2.00.1009231021080.32567-sBS69tsa9Uj/9pzu0YdTqQ@public.gmane.org>
2010-09-27 19:24 ` David Stevens
[not found] ` <alpine.DEB.2.00.1009221448460.32661-sBS69tsa9Uj/9pzu0YdTqQ@public.gmane.org>
2010-09-22 20:36 ` David Stevens
2010-09-22 21:26 ` Christoph Lameter
2010-09-22 21:50 ` Jason Gunthorpe
2010-09-23 15:32 ` Christoph Lameter
2010-09-23 17:26 ` Jason Gunthorpe
[not found] ` <20100923172603.GM11157-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
2010-09-23 17:37 ` Christoph Lameter
[not found] ` <alpine.DEB.2.00.1009231230100.2962-sBS69tsa9Uj/9pzu0YdTqQ@public.gmane.org>
2010-09-23 17:46 ` Jason Gunthorpe
[not found] ` <20100923174614.GN11157-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
2010-09-23 17:56 ` Christoph Lameter
2010-09-27 19:32 ` David Stevens
2010-09-24 4:38 ` igmp: Allow mininum interval specification for igmp timers David Miller
[not found] ` <20100923.213823.137834706.davem-fT/PcQaiUtIeIZ0/mPfg9Q@public.gmane.org>
2010-09-27 17:41 ` David Stevens
[not found] ` <OF4BA8F9C2.467056E9-ON882577AB.005F4832-882577AB.00612DD6-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2010-09-27 17:54 ` David Miller
[not found] ` <20100927.105444.214208865.davem-fT/PcQaiUtIeIZ0/mPfg9Q@public.gmane.org>
2010-09-27 18:16 ` David Stevens
2010-09-27 19:55 ` David Stevens
2010-09-27 20:20 ` Christoph Lameter
[not found] ` <alpine.DEB.2.00.1009271503420.14117-sBS69tsa9Uj/9pzu0YdTqQ@public.gmane.org>
2010-09-27 21:45 ` David Stevens
[not found] ` <OF4DDFA464.A933254C-ON882577AB.00754000-882577AB.00778C77-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2010-09-28 18:42 ` Christoph Lameter
This is a public inbox; see the mirroring instructions for how to clone and mirror all data and code used for this inbox, as well as URLs for NNTP newsgroup(s).