* [PATCH] Make multicast and path record queue flexible.
@ 2010-10-05 16:07 Aleksey Senin
[not found] ` <4CAB4D49.9090107-smomgflXvOZWk0Htik3J/w@public.gmane.org>
0 siblings, 1 reply; 17+ messages in thread
From: Aleksey Senin @ 2010-10-05 16:07 UTC (permalink / raw)
To: linux-rdma-u79uwXL29TY76Z2rM5mHXA
Cc: ewg-ZwoEplunGu1OwGhvXhtEPSCwEArCW2h5@public.gmane.org,
Roland Dreier, Moni Shoua
When using a slow SM, allow more packets to be buffered before the answer
comes back. This patch is based on an idea of Christoph Lameter.
http://lists.openfabrics.org/pipermail/general/2009-June/059853.html
Signed-off-by: Aleksey Senin <alekseys-smomgflXvOZWk0Htik3J/w@public.gmane.org>
---
drivers/infiniband/ulp/ipoib/ipoib.h | 2 +
drivers/infiniband/ulp/ipoib/ipoib_main.c | 91 +++++++++++++++++++++++-
drivers/infiniband/ulp/ipoib/ipoib_multicast.c | 2 +-
3 files changed, 91 insertions(+), 4 deletions(-)
diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h
index 753a983..159e29c 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib.h
+++ b/drivers/infiniband/ulp/ipoib/ipoib.h
@@ -698,6 +698,8 @@ static inline void ipoib_unregister_debugfs(void) { }
extern int ipoib_sendq_size;
extern int ipoib_recvq_size;
+extern unsigned int ipoib_prec_qlen;
+extern unsigned int ipoib_mcast_qlen;
extern struct ib_sa_client ipoib_sa_client;
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c
index b4b2257..7101e0d 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -69,6 +69,85 @@ module_param(lro_max_aggr, int, 0644);
MODULE_PARM_DESC(lro_max_aggr, "LRO: Max packets to be aggregated "
"(default = 64)");
+unsigned int ipoib_prec_qlen = IPOIB_MAX_PATH_REC_QUEUE;
+unsigned int ipoib_mcast_qlen = IPOIB_MAX_MCAST_QUEUE;
+
+static struct ctl_table_header *ipoib_table_header;
+
+#define MIN_IPOIB_QLENGTH 1
+#define MAX_IPOIB_QLENGTH 256
+
+static unsigned int min_ipoib_qlen = MIN_IPOIB_QLENGTH;
+static unsigned int max_ipoib_qlen = MAX_IPOIB_QLENGTH;
+
+static ctl_table ipoib_tunable_table[] = {
+ {
+ .procname = "prec_qlen",
+ .data = &ipoib_prec_qlen,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &min_ipoib_qlen,
+ .extra2 = &max_ipoib_qlen
+ },
+ {
+ .procname = "mcast_qlen",
+ .data = &ipoib_mcast_qlen,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &min_ipoib_qlen,
+ .extra2 = &max_ipoib_qlen
+ },
+ {},
+};
+
+static ctl_table ipoib_table[] = {
+ {
+ .procname = "ib_ipoib",
+ .mode = 0555,
+ .maxlen = 0,
+ .child = ipoib_tunable_table
+ },
+ {},
+};
+
+static int param_set_uint_minmax(const char *val,
+ const struct kernel_param *kp,
+ unsigned int min, unsigned int max)
+{
+ unsigned long num;
+ int ret;
+
+ if (!val)
+ return -EINVAL;
+ ret = strict_strtoul(val, 0, &num);
+ if (ret == -EINVAL || num < min || num > max)
+ return -EINVAL;
+ *((unsigned int *)kp->arg) = num;
+ return 0;
+}
+
+static int param_set_queue_length(const char *val,
+ const struct kernel_param *kp)
+{
+ return param_set_uint_minmax(val, kp,\
+ MIN_IPOIB_QLENGTH, MAX_IPOIB_QLENGTH);
+}
+
+static struct kernel_param_ops param_ops_queue_length = {
+ .set = param_set_queue_length,
+ .get = param_get_uint,
+};
+
+#define param_check_queue_length(name, p) \
+ __param_check(name, p, unsigned int);
+
+module_param_named(prec_qlen, ipoib_prec_qlen, queue_length, 0644);
+MODULE_PARM_DESC(prec_qlen, "Path record queue length ([1..256], default = 3)");
+module_param_named(mcast_qlen, ipoib_mcast_qlen, queue_length, 0644);
+MODULE_PARM_DESC(mcast_qlen, "Multicast queue length ([1...256], default = 3)");
+
#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
int ipoib_debug_level;
@@ -597,7 +676,7 @@ static void neigh_add_path(struct sk_buff *skb, struct net_device *dev)
ipoib_neigh_free(dev, neigh);
goto err_drop;
}
- if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE)
+ if (skb_queue_len(&neigh->queue) < ipoib_prec_qlen)
__skb_queue_tail(&neigh->queue, skb);
else {
ipoib_warn(priv, "queue length limit %d. Packet drop.\n",
@@ -695,7 +774,7 @@ static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev,
ipoib_send(dev, skb, path->ah, IPOIB_QPN(phdr->hwaddr));
return;
} else if ((path->query || !path_rec_start(dev, path)) &&
- skb_queue_len(&path->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
+ skb_queue_len(&path->queue) < ipoib_prec_qlen) {
/* put pseudoheader back on for next time */
skb_push(skb, sizeof *phdr);
__skb_queue_tail(&path->queue, skb);
@@ -752,7 +831,7 @@ static int ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)
return NETDEV_TX_OK;
}
- if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
+ if (skb_queue_len(&neigh->queue) < ipoib_prec_qlen) {
spin_lock_irqsave(&priv->lock, flags);
__skb_queue_tail(&neigh->queue, skb);
spin_unlock_irqrestore(&priv->lock, flags);
@@ -1425,6 +1504,8 @@ static int __init ipoib_init_module(void)
if (ret)
return ret;
+ if (!ipoib_table_header)
+ ipoib_table_header = register_sysctl_table(ipoib_table);
/*
* We create our own workqueue mainly because we want to be
* able to flush it when devices are being removed. We can't
@@ -1461,6 +1542,10 @@ static void __exit ipoib_cleanup_module(void)
{
ib_unregister_client(&ipoib_client);
ib_sa_unregister_client(&ipoib_sa_client);
+ if (ipoib_table_header) {
+ unregister_sysctl_table(ipoib_table_header);
+ ipoib_table_header = NULL;
+ }
ipoib_unregister_debugfs();
destroy_workqueue(ipoib_workqueue);
}
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
index 3871ac6..1f2d28e 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
@@ -693,7 +693,7 @@ void ipoib_mcast_send(struct net_device *dev, void *mgid, struct sk_buff *skb)
}
if (!mcast->ah) {
- if (skb_queue_len(&mcast->pkt_queue) < IPOIB_MAX_MCAST_QUEUE)
+ if (skb_queue_len(&mcast->pkt_queue) < ipoib_mcast_qlen)
skb_queue_tail(&mcast->pkt_queue, skb);
else {
++dev->stats.tx_dropped;
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply related [flat|nested] 17+ messages in thread
* Re: [PATCH] Make multicast and path record queue flexible.
[not found] ` <4CAB4D49.9090107-smomgflXvOZWk0Htik3J/w@public.gmane.org>
@ 2010-10-05 16:28 ` Jason Gunthorpe
[not found] ` <20101005162833.GC5967-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
2010-10-05 19:18 ` Christoph Lameter
1 sibling, 1 reply; 17+ messages in thread
From: Jason Gunthorpe @ 2010-10-05 16:28 UTC (permalink / raw)
To: Aleksey Senin
Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA,
ewg-ZwoEplunGu1OwGhvXhtEPSCwEArCW2h5@public.gmane.org,
Roland Dreier, Moni Shoua
On Tue, Oct 05, 2010 at 06:07:37PM +0200, Aleksey Senin wrote:
> When using slow SM allow more packets to be buffered before answer
> comming back. This patch based on idea of Christoph Lameter.
>
> http://lists.openfabrics.org/pipermail/general/2009-June/059853.html
IMHO, I think it is better to send multicasts to the broadcast MLID than to
queue them.. More like ethernet that way.
Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [PATCH] Make multicast and path record queue flexible.
[not found] ` <20101005162833.GC5967-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
@ 2010-10-05 19:12 ` Christoph Lameter
[not found] ` <alpine.DEB.2.00.1010051410370.7065-sBS69tsa9Uj/9pzu0YdTqQ@public.gmane.org>
0 siblings, 1 reply; 17+ messages in thread
From: Christoph Lameter @ 2010-10-05 19:12 UTC (permalink / raw)
To: Jason Gunthorpe
Cc: Aleksey Senin, linux-rdma-u79uwXL29TY76Z2rM5mHXA,
ewg-ZwoEplunGu1OwGhvXhtEPSCwEArCW2h5@public.gmane.org,
Roland Dreier, Moni Shoua
On Tue, 5 Oct 2010, Jason Gunthorpe wrote:
> On Tue, Oct 05, 2010 at 06:07:37PM +0200, Aleksey Senin wrote:
> > When using slow SM allow more packets to be buffered before answer
> > comming back. This patch based on idea of Christoph Lameter.
> >
> > http://lists.openfabrics.org/pipermail/general/2009-June/059853.html
>
> IMHO, I think it is better to send multicasts to the broadcast MLID than to
> queue them.. More like ethernet that way.
I agree. We had similar ideas. However, the kernel does send igmp
reports to the MC address, not to 224.0.0.2. We would have to redirect at
the IB layer until multicast via MLID becomes functional. We cannot tell
when that will be the case.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [PATCH] Make multicast and path record queue flexible.
[not found] ` <4CAB4D49.9090107-smomgflXvOZWk0Htik3J/w@public.gmane.org>
2010-10-05 16:28 ` Jason Gunthorpe
@ 2010-10-05 19:18 ` Christoph Lameter
[not found] ` <alpine.DEB.2.00.1010051415100.7065-sBS69tsa9Uj/9pzu0YdTqQ@public.gmane.org>
1 sibling, 1 reply; 17+ messages in thread
From: Christoph Lameter @ 2010-10-05 19:18 UTC (permalink / raw)
To: Aleksey Senin
Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA,
ewg-ZwoEplunGu1OwGhvXhtEPSCwEArCW2h5@public.gmane.org,
Roland Dreier, Moni Shoua
On Tue, 5 Oct 2010, Aleksey Senin wrote:
> When using slow SM allow more packets to be buffered before answer
> comming back. This patch based on idea of Christoph Lameter.
I agree, I think we need to have those things configurable.
Reviewed-by: Christoph Lameter <cl-vYTEC60ixJUAvxtiuMwx3w@public.gmane.org>
How do you handle the situation of the SM responding before the fabric has
been reconfigured? I do not see any delay on join. So they will be dropped
if the fabric was not reconfigured fast enough? Or does the SM somehow
delay the response?
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [PATCH] Make multicast and path record queue flexible.
[not found] ` <alpine.DEB.2.00.1010051410370.7065-sBS69tsa9Uj/9pzu0YdTqQ@public.gmane.org>
@ 2010-10-05 19:54 ` Jason Gunthorpe
[not found] ` <20101005195428.GB24268-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
2010-10-12 16:29 ` Alekseys Senin
1 sibling, 1 reply; 17+ messages in thread
From: Jason Gunthorpe @ 2010-10-05 19:54 UTC (permalink / raw)
To: Christoph Lameter
Cc: Aleksey Senin, linux-rdma-u79uwXL29TY76Z2rM5mHXA,
ewg-ZwoEplunGu1OwGhvXhtEPSCwEArCW2h5@public.gmane.org,
Roland Dreier, Moni Shoua
On Tue, Oct 05, 2010 at 02:12:57PM -0500, Christoph Lameter wrote:
> On Tue, 5 Oct 2010, Jason Gunthorpe wrote:
>
> > On Tue, Oct 05, 2010 at 06:07:37PM +0200, Aleksey Senin wrote:
> > > When using slow SM allow more packets to be buffered before answer
> > > comming back. This patch based on idea of Christoph Lameter.
> > >
> > > http://lists.openfabrics.org/pipermail/general/2009-June/059853.html
> >
> > IMHO, I think it is better to send multicasts to the broadcast MLID than to
> > queue them.. More like ethernet that way.
>
> I agree. We had similar ideas. However, the kernel does send igmp
> reports to the MC address not to 244.0.0.2. We would have to redirect at
> the IB layer until multicast via MLID becomes functional. We cannot tell
> when that will be the case.
Sure, but Aleksey's patch is aimed at the case when the SM has not yet
replied, not for your problem with IGMPv2. If there is no MLID then
sending to the broadcast MLID is a better choice than hanging onto the
packets. I wonder if you could even send unicasts to the broadcast?
I still think the problem you have with IGMPv2 is best solved by
leaning on the gateway vendors to support IGMPv3 - which *does* send
all reports to 224.0.0.22
Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [PATCH] Make multicast and path record queue flexible.
[not found] ` <20101005195428.GB24268-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
@ 2010-10-05 20:02 ` Christoph Lameter
[not found] ` <alpine.DEB.2.00.1010051500330.8786-sBS69tsa9Uj/9pzu0YdTqQ@public.gmane.org>
0 siblings, 1 reply; 17+ messages in thread
From: Christoph Lameter @ 2010-10-05 20:02 UTC (permalink / raw)
To: Jason Gunthorpe
Cc: Aleksey Senin, linux-rdma-u79uwXL29TY76Z2rM5mHXA,
ewg-ZwoEplunGu1OwGhvXhtEPSCwEArCW2h5@public.gmane.org,
Roland Dreier, Moni Shoua
On Tue, 5 Oct 2010, Jason Gunthorpe wrote:
> > I agree. We had similar ideas. However, the kernel does send igmp
> > reports to the MC address not to 244.0.0.2. We would have to redirect at
> > the IB layer until multicast via MLID becomes functional. We cannot tell
> > when that will be the case.
>
> Sure, but Aleksey's patch is aimed at the case when the SM has not yet
> replied, not for your problem with IGMPv2. If their is no MLID then
> sending to the broadcast MLID is a better choice than hanging onto the
> packets. I wonder if you could even send unicasts to the broadcast?
The problem that the SM has not yet replied is no different between the
IGMP versions. If you get a confirmation but the MC group is not
functional then packets go nowhere.
> I still think the problem you have with IGMPv2 is best solved by
> leaning on the gateway vendors to support IGMPv3 - which *does* send
> all reports to 244.0.0.22
s/22/2
Certainly a solution for the igmp messages themselves but not for
initial traffic or traffic sent via sendonly join.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [PATCH] Make multicast and path record queue flexible.
[not found] ` <alpine.DEB.2.00.1010051500330.8786-sBS69tsa9Uj/9pzu0YdTqQ@public.gmane.org>
@ 2010-10-05 20:21 ` Jason Gunthorpe
[not found] ` <20101005202121.GC24268-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
0 siblings, 1 reply; 17+ messages in thread
From: Jason Gunthorpe @ 2010-10-05 20:21 UTC (permalink / raw)
To: Christoph Lameter
Cc: Aleksey Senin, linux-rdma-u79uwXL29TY76Z2rM5mHXA,
ewg-ZwoEplunGu1OwGhvXhtEPSCwEArCW2h5@public.gmane.org,
Roland Dreier, Moni Shoua
On Tue, Oct 05, 2010 at 03:02:21PM -0500, Christoph Lameter wrote:
> On Tue, 5 Oct 2010, Jason Gunthorpe wrote:
>
> > > I agree. We had similar ideas. However, the kernel does send igmp
> > > reports to the MC address not to 244.0.0.2. We would have to redirect at
> > > the IB layer until multicast via MLID becomes functional. We cannot tell
> > > when that will be the case.
> >
> > Sure, but Aleksey's patch is aimed at the case when the SM has not yet
> > replied, not for your problem with IGMPv2. If their is no MLID then
> > sending to the broadcast MLID is a better choice than hanging onto the
> > packets. I wonder if you could even send unicasts to the broadcast?
>
> The problem that the SM has not yet replied is no different between the
> IGMP versions. If you get a confirmation but the MC group is not
> functional then packets go nowhere.
Getting a MLID that is not 'functional' is a different problem. Aleksey
is looking at the case when there is no MLID at all, and I think
queuing is the wrong approach.
> > I still think the problem you have with IGMPv2 is best solved by
> > leaning on the gateway vendors to support IGMPv3 - which *does* send
> > all reports to 244.0.0.22
>
> s/22/2
No, 22. RFC 3376:
4.2.14. IP Destination Addresses for Reports
Version 3 Reports are sent with an IP destination address of
224.0.0.22, to which all IGMPv3-capable multicast routers listen. A
system that is operating in version 1 or version 2 compatibility
modes sends version 1 or version 2 Reports to the multicast group
specified in the Group Address field of the Report. In addition, a
system MUST accept and process any version 1 or version 2 Report
whose IP Destination Address field contains *any* of the addresses
(unicast or multicast) assigned to the interface on which the Report
arrives.
> Certainly a solution for the igmp messages themselves but not for
> initial traffic or traffic send via sendonly join.
Using .22 will generally solve the problems with synchronizing the
IPoIB gateway to the state of the IGMPv3 clients. Yes, there will
still be some unknown lag in building the IB side of the network and
for the router(s) to get ready to handle the group - but at least it
is no longer dependent on any timeouts.
Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [PATCH] Make multicast and path record queue flexible.
[not found] ` <20101005202121.GC24268-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
@ 2010-10-05 20:43 ` Christoph Lameter
[not found] ` <alpine.DEB.2.00.1010051535480.10603-sBS69tsa9Uj/9pzu0YdTqQ@public.gmane.org>
0 siblings, 1 reply; 17+ messages in thread
From: Christoph Lameter @ 2010-10-05 20:43 UTC (permalink / raw)
To: Jason Gunthorpe
Cc: Aleksey Senin, linux-rdma-u79uwXL29TY76Z2rM5mHXA,
ewg-ZwoEplunGu1OwGhvXhtEPSCwEArCW2h5@public.gmane.org,
Roland Dreier, Moni Shoua
On Tue, 5 Oct 2010, Jason Gunthorpe wrote:
> > > I still think the problem you have with IGMPv2 is best solved by
> > > leaning on the gateway vendors to support IGMPv3 - which *does* send
> > > all reports to 244.0.0.22
> >
> > s/22/2
>
> No, 22. RFC 3376:
>
> 4.2.14. IP Destination Addresses for Reports
>
> Version 3 Reports are sent with an IP destination address of
> 224.0.0.22, to which all IGMPv3-capable multicast routers listen. A
> system that is operating in version 1 or version 2 compatibility
> modes sends version 1 or version 2 Reports to the multicast group
> specified in the Group Address field of the Report. In addition, a
> system MUST accept and process any version 1 or version 2 Report
> whose IP Destination Address field contains *any* of the addresses
> (unicast or multicast) assigned to the interface on which the Report
> arrives.
Argh. Another MC group. And the ib layer does need to do IB level joins
for those. So the initial messages will be lost for the first join(s)?
> > Certainly a solution for the igmp messages themselves but not for
> > initial traffic or traffic send via sendonly join.
>
> Using .22 will generally solve the problems with sychronizing the
> IPoIB gateway to the state of the IGMPv3 clients. Yes, there will
> still be some unknown lag in building the IB side of the network and
> for the router(s) to get ready to handle the group - but at least it
> is no longer dependent on any timeouts.
How do you propose to handle the IB level join to 224.0.0.22 to avoid
packet loss there? IGMP messages will still get lost because of that.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [PATCH] Make multicast and path record queue flexible.
[not found] ` <alpine.DEB.2.00.1010051535480.10603-sBS69tsa9Uj/9pzu0YdTqQ@public.gmane.org>
@ 2010-10-05 20:55 ` Jason Gunthorpe
[not found] ` <20101005205545.GF24268-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
0 siblings, 1 reply; 17+ messages in thread
From: Jason Gunthorpe @ 2010-10-05 20:55 UTC (permalink / raw)
To: Christoph Lameter
Cc: Aleksey Senin, linux-rdma-u79uwXL29TY76Z2rM5mHXA,
ewg-ZwoEplunGu1OwGhvXhtEPSCwEArCW2h5@public.gmane.org,
Roland Dreier, Moni Shoua
On Tue, Oct 05, 2010 at 03:43:47PM -0500, Christoph Lameter wrote:
> > Using .22 will generally solve the problems with sychronizing the
> > IPoIB gateway to the state of the IGMPv3 clients. Yes, there will
> > still be some unknown lag in building the IB side of the network and
> > for the router(s) to get ready to handle the group - but at least it
> > is no longer dependent on any timeouts.
>
> How do you propose to handle the IB level join to 224.0.0.22 to avoid
> packet loss there? IGMP messages will still get lost because of that.
First, the routers all join the group at startup and stay joined
forever. This avoids the race in the router joining a new MGID after
the client creates it, but before the IGMPv2 report is sent. I expect
this is a major source of delay and uncertainty
Second, since all clients join this group as send-only it becomes
possible for the SM to do reasonable things - for instance the MLID
can be pre-provisioned as send-only from any end-port and thus after
the SM replies with a MLID the MLID is guaranteed good for send-only
use immediately.
Third, once the client enters IGMPv3 mode and joins the group (maybe
at system boot?) it stays joined forever.
Finally, by sending multicast packets to the broadcast during the time
the MLID is unknown we can pretty much guarantee that the first IGMPv3
packet that is sent to .22 will reach all routers in a timely fashion.
(Hence my objection to Aleksey's approach)
Basically, this completely solves the IGMP client to IPoIB router
communication problem. Yes, there will still be an unknown time until
the IB network, router, and whatever is beyond the router is ready to
actually process packets on a new group - BUT that is normal for IP
multicast! The main point is that without lost IGMP packets things can
proceed without relying on timeouts.
Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [PATCH] Make multicast and path record queue flexible.
[not found] ` <20101005205545.GF24268-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
@ 2010-10-05 21:12 ` Christoph Lameter
[not found] ` <alpine.DEB.2.00.1010051600190.10603-sBS69tsa9Uj/9pzu0YdTqQ@public.gmane.org>
0 siblings, 1 reply; 17+ messages in thread
From: Christoph Lameter @ 2010-10-05 21:12 UTC (permalink / raw)
To: Jason Gunthorpe
Cc: Aleksey Senin, linux-rdma-u79uwXL29TY76Z2rM5mHXA,
ewg-ZwoEplunGu1OwGhvXhtEPSCwEArCW2h5@public.gmane.org,
Roland Dreier, Moni Shoua
On Tue, 5 Oct 2010, Jason Gunthorpe wrote:
> > How do you propose to handle the IB level join to 224.0.0.22 to avoid
> > packet loss there? IGMP messages will still get lost because of that.
>
> First, the routers all join the group at startup and stay joined
> forever. This avoids the race in the route joining a new MGID after
> the client creates it, but before the IGMPv2 report is sent. I expect
> this is a major source of delay and uncertainty
I think the current routers join 224.0.0.2 already. Adding another MC
group should come with IGMPv3 support.
> Second, since all clients join this group as send-only it becomes
> possible for the SM to do reasonable things - for instance the MLID
> can be pre-provisioned as send-only from any end-port and thus after
> the SM replies with a MLID the MLID is guaranteed good for send-only
> use immediately.
The problem is that the client join on 224.0.0.22 will be delayed due to
fabric reconfig. The group is joined on demand. It is not automatically
joined.
> Third, once the client etners IGMPv3 mode and joins the group (maybe
> at system boot?) it stays joined forever.
IGMP does not explicitly join 224.0.0.X groups. Looks like messages to
224.0.0.X will not be sent unless there is no other responder on the
subnet. So the initial messages for the first join getting lost
may still be a problem.
> Finally, by sending multicast packets to the broadcast during the time
> the MLID is unknown we can pretty much guarantee that the first IGMPv3
> packet that is sent to .22 will reach all routers in a timely fashion.
> (Hence my objection to Aleksey's approach)
Right. So the multicast traffic will flow to the broadcast address until
the SM sends the response. The multicast traffic will then get lost until
the fabric reconfig is complete.
> Basically, this completely solves the IGMP client to IPoIB router
> communication problem. Yes, there will still be an unknown time until
> the IB network, router, and whatever is beyond the router is ready to
> actually process packets on a new group - BUT that is normal for IP
> multicast! The main point is that without lost IGMP packets things can
> proceed without relying on timeouts.
Sure this sounds to be a much better approach (we have thought through
such approaches here repeatedly) but I do not know of any IB gateway that
supports IGMPv3.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [PATCH] Make multicast and path record queue flexible.
[not found] ` <alpine.DEB.2.00.1010051600190.10603-sBS69tsa9Uj/9pzu0YdTqQ@public.gmane.org>
@ 2010-10-05 21:28 ` Jason Gunthorpe
[not found] ` <20101005212826.GG24268-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
0 siblings, 1 reply; 17+ messages in thread
From: Jason Gunthorpe @ 2010-10-05 21:28 UTC (permalink / raw)
To: Christoph Lameter
Cc: Aleksey Senin, linux-rdma-u79uwXL29TY76Z2rM5mHXA,
ewg-ZwoEplunGu1OwGhvXhtEPSCwEArCW2h5@public.gmane.org,
Roland Dreier, Moni Shoua
On Tue, Oct 05, 2010 at 04:12:59PM -0500, Christoph Lameter wrote:
> On Tue, 5 Oct 2010, Jason Gunthorpe wrote:
>
> > > How do you propose to handle the IB level join to 224.0.0.22 to avoid
> > > packet loss there? IGMP messages will still get lost because of that.
> >
> > First, the routers all join the group at startup and stay joined
> > forever. This avoids the race in the route joining a new MGID after
> > the client creates it, but before the IGMPv2 report is sent. I expect
> > this is a major source of delay and uncertainty
>
> I think the current routers join 224.0.0.2 already. Adding another MC
> group should come with IGMPv3 support.
Sure, .22 is definitely something routers need to have with IGMPv3.
> > Second, since all clients join this group as send-only it becomes
> > possible for the SM to do reasonable things - for instance the MLID
> > can be pre-provisioned as send-only from any end-port and thus after
> > the SM replies with a MLID the MLID is guaranteed good for send-only
> > use immediately.
>
> The problem is that the client join on 224.0.0.22 will be delayed due to
> fabric reconfig. The group is joined on demand. It is not automatically
> joined.
I was trying to explain that it is possible for the SM to provide a
MLID that is fully functional for .22 - there is no behind the scenes
network reconfiguring delay. This is doable with IGMPv3 because the
client join is send-only and all the listeners have been joined for a
long time.
Basically, the SM pushes out an all end-ports send-only configuration
for the MLID when the listeners join. So there *is no reconfiguration*
for a new send-only join to complete. No reconfiguration means no lost
packets.
Not sure if any SMs work this way already but they already have
special support for things like the IPv4 broadcast so it is completely
reasonable to have special support for IGMPv3 all routers as well.
A 'fast send-only join' configurable for MGIDs would do the job.
There is virtually no cost with preconfiguring switches for send only
traffic.
> > Finally, by sending multicast packets to the broadcast during the time
> > the MLID is unknown we can pretty much guarantee that the first IGMPv3
> > packet that is sent to .22 will reach all routers in a timely fashion.
> > (Hence my objection to Aleksey's approach)
>
> Right. So the multicast traffic will flow to the broadcast address until
> the SM sends the response. The multicast traffic will then get lost until
> the fabric reconfig is complete.
See above, that is avoidable with some SM help too.
> Sure this sounds to be a much better approach (we have thought through
> such approaches here repeatedly) but I do not know of any IB gateway that
> supports IGMPv3.
Lean on the vendors :( Seems crazy to not implement v3 when v2 is so
unworkable on IB.
Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [PATCH] Make multicast and path record queue flexible.
[not found] ` <20101005212826.GG24268-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
@ 2010-10-05 21:34 ` Christoph Lameter
0 siblings, 0 replies; 17+ messages in thread
From: Christoph Lameter @ 2010-10-05 21:34 UTC (permalink / raw)
To: Jason Gunthorpe
Cc: Aleksey Senin, linux-rdma-u79uwXL29TY76Z2rM5mHXA,
ewg-ZwoEplunGu1OwGhvXhtEPSCwEArCW2h5@public.gmane.org,
Roland Dreier, Moni Shoua
On Tue, 5 Oct 2010, Jason Gunthorpe wrote:
> > The problem is that the client join on 224.0.0.22 will be delayed due to
> > fabric reconfig. The group is joined on demand. It is not automatically
> > joined.
>
> I was trying to explain that it is possible for the SM to provide a
> MLID that is fully functional for .22 - there is no behind the scenes
> network reconfiguring delay. This is doable with IGMPv3 because the
> client join is send-only and all the listeners have been joined for a
> long time.
Ahh.. Good idea.
> There is virtually no cost with preconfiguring switches for send only
> traffic.
True.
> > Sure this sounds to be a much better approach (we have thought through
> > such approaches here repeatedly) but I do not know of any IB gateway that
> > supports IGMPv3.
>
> Lean on the vendors :( Seems crazy to not implement v3 when v2 is so
> unworkable on IB.
Oh we will.... Do obsidian routers support IGMPv3 for IB?
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [PATCH] Make multicast and path record queue flexible.
[not found] ` <alpine.DEB.2.00.1010051415100.7065-sBS69tsa9Uj/9pzu0YdTqQ@public.gmane.org>
@ 2010-10-06 15:57 ` Alekseys Senin
[not found] ` <1286380676.31487.1.camel-uOVkuFIEnOODI2cvxHXf6UEOCMrvLtNR@public.gmane.org>
0 siblings, 1 reply; 17+ messages in thread
From: Alekseys Senin @ 2010-10-06 15:57 UTC (permalink / raw)
To: Christoph Lameter
Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA,
ewg-ZwoEplunGu1OwGhvXhtEPSCwEArCW2h5@public.gmane.org,
Roland Dreier, Moni Shoua
On Tue, 2010-10-05 at 14:18 -0500, Christoph Lameter wrote:
> On Tue, 5 Oct 2010, Aleksey Senin wrote:
>
> > When using slow SM allow more packets to be buffered before answer
> > comming back. This patch based on idea of Christoph Lameter.
>
> I agree, I think we need to have those things configurable.
>
> Reviewed-by: Christoph Lameter <cl-vYTEC60ixJUAvxtiuMwx3w@public.gmane.org>
>
> How do you handle the situation of the SM responding before the fabric has
> been reconfigured? I do not see any delay on join. So they will be dropped
> if the fabric was not reconfigured fast enough? Or does the SM somehow
> delay the response?
I think the issue of whether to send or delay packets after obtaining
the MLID should be solved in another patch. The proposed code improves
today's situation, where you can't change it at all.
Regarding broadcast, I don't think it is a good solution: it will
bring unwarranted load, especially in the case where no MLID is received.
Adding a delay before we start to send packets seems better to me.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [PATCH] Make multicast and path record queue flexible.
[not found] ` <1286380676.31487.1.camel-uOVkuFIEnOODI2cvxHXf6UEOCMrvLtNR@public.gmane.org>
@ 2010-10-06 16:16 ` Christoph Lameter
[not found] ` <alpine.DEB.2.00.1010061114590.31538-sBS69tsa9Uj/9pzu0YdTqQ@public.gmane.org>
0 siblings, 1 reply; 17+ messages in thread
From: Christoph Lameter @ 2010-10-06 16:16 UTC (permalink / raw)
To: Alekseys Senin
Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA,
ewg-ZwoEplunGu1OwGhvXhtEPSCwEArCW2h5@public.gmane.org,
Roland Dreier, Moni Shoua
On Wed, 6 Oct 2010, Alekseys Senin wrote:
> > How do you handle the situation of the SM responding before the fabric has
> > been reconfigured? I do not see any delay on join. So they will be dropped
> > if the fabric was not reconfigured fast enough? Or does the SM somehow
> > delay the response?
>
> I think the issue of whether to send or delay packets after obtaining
> the MLID should be solved in another patch. The proposed code improves
> today's situation, in which it cannot be changed at all.
I agree with that.
> Regarding broadcast, I don't think it is a good solution: it will bring
> unwarranted load, especially when no MLID is received. Adding a delay
> before we start to send packets seems better to me.
Broadcast is a temporary solution, and it only occurs within an IB
partition. It would be a problem if one host suddenly started sending
at full line speed. Maybe make that configurable?
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [PATCH] Make multicast and path record queue flexible.
[not found] ` <alpine.DEB.2.00.1010061114590.31538-sBS69tsa9Uj/9pzu0YdTqQ@public.gmane.org>
@ 2010-10-06 17:34 ` Jason Gunthorpe
0 siblings, 0 replies; 17+ messages in thread
From: Jason Gunthorpe @ 2010-10-06 17:34 UTC (permalink / raw)
To: Christoph Lameter
Cc: Alekseys Senin, linux-rdma-u79uwXL29TY76Z2rM5mHXA,
ewg-ZwoEplunGu1OwGhvXhtEPSCwEArCW2h5@public.gmane.org,
Roland Dreier, Moni Shoua
On Wed, Oct 06, 2010 at 11:16:59AM -0500, Christoph Lameter wrote:
> > Regarding broadcast, I don't think it is a good solution: it will bring
> > unwarranted load, especially when no MLID is received. Adding a delay
> > before we start to send packets seems better to me.
>
> Broadcast is a temporary solution, and it only occurs within an IB
> partition. It would be a problem if one host suddenly started sending
> at full line speed. Maybe make that configurable?
Right, and keep in mind that sending a packet with the broadcast MLID
and group-specific MGID will still result in the HCAs filtering based
on MGID. Personally, I'm not concerned about flooding - this is how
things work in Ethernet - any sane app will need some way to cope with
that.
Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [PATCH] Make multicast and path record queue flexible.
[not found] ` <alpine.DEB.2.00.1010051410370.7065-sBS69tsa9Uj/9pzu0YdTqQ@public.gmane.org>
2010-10-05 19:54 ` Jason Gunthorpe
@ 2010-10-12 16:29 ` Alekseys Senin
[not found] ` <1286900993.31931.4.camel-uOVkuFIEnOODI2cvxHXf6UEOCMrvLtNR@public.gmane.org>
1 sibling, 1 reply; 17+ messages in thread
From: Alekseys Senin @ 2010-10-12 16:29 UTC (permalink / raw)
To: Christoph Lameter
Cc: Jason Gunthorpe, linux-rdma-u79uwXL29TY76Z2rM5mHXA,
ewg-ZwoEplunGu1OwGhvXhtEPSCwEArCW2h5@public.gmane.org, Moni Shoua
[-- Attachment #1.1: Type: text/plain, Size: 942 bytes --]
On Tue, 2010-10-05 at 14:12 -0500, Christoph Lameter wrote:
> On Tue, 5 Oct 2010, Jason Gunthorpe wrote:
>
> > On Tue, Oct 05, 2010 at 06:07:37PM +0200, Aleksey Senin wrote:
> > > > When using slow SM allow more packets to be buffered before answer
> > > > coming back. This patch based on idea of Christoph Lameter.
> > >
> > > http://lists.openfabrics.org/pipermail/general/2009-June/059853.html
> >
> > IMHO, I think it is better to send multicasts to the broadcast MLID than to
> > queue them.. More like ethernet that way.
>
> I agree. We had similar ideas. However, the kernel does send IGMP
> reports to the MC address, not to 224.0.0.2. We would have to redirect at
> the IB layer until multicast via MLID becomes functional. We cannot tell
> when that will be the case.
>
>
But what if it is not available for some reason? How long should
we wait? Do we need to implement another queue/counter/timeout?
[-- Attachment #1.2: Type: text/html, Size: 1318 bytes --]
[-- Attachment #2: Type: text/plain, Size: 176 bytes --]
_______________________________________________
ewg mailing list
ewg-ZwoEplunGu1OwGhvXhtEPSCwEArCW2h5@public.gmane.org
http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ewg
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [PATCH] Make multicast and path record queue flexible.
[not found] ` <1286900993.31931.4.camel-uOVkuFIEnOODI2cvxHXf6UEOCMrvLtNR@public.gmane.org>
@ 2010-10-12 20:47 ` Jason Gunthorpe
0 siblings, 0 replies; 17+ messages in thread
From: Jason Gunthorpe @ 2010-10-12 20:47 UTC (permalink / raw)
To: Alekseys Senin
Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA, Moni Shoua, Christoph Lameter,
ewg-ZwoEplunGu1OwGhvXhtEPSCwEArCW2h5@public.gmane.org
On Tue, Oct 12, 2010 at 06:29:53PM +0200, Alekseys Senin wrote:
> On Tue, 2010-10-05 at 14:12 -0500, Christoph Lameter wrote:
>
> On Tue, 5 Oct 2010, Jason Gunthorpe wrote:
>
> > On Tue, Oct 05, 2010 at 06:07:37PM +0200, Aleksey Senin wrote:
> > > > When using slow SM allow more packets to be buffered before answer
> > > > coming back. This patch based on idea of Christoph Lameter.
> > >
> > > http://lists.openfabrics.org/pipermail/general/2009-June/059853.html
> >
> > IMHO, I think it is better to send multicasts to the broadcast MLID than to
> > queue them.. More like ethernet that way.
>
> I agree. We had similar ideas. However, the kernel does send IGMP
> reports to the MC address, not to 224.0.0.2. We would have to redirect at
> the IB layer until multicast via MLID becomes functional. We cannot tell
> when that will be the case.
>
> But what if it is not available for some reason? How long
> should we wait? Do we need to implement another queue/counter/timeout?
If you follow the scheme I outlined - where traffic to an MGID that
doesn't yet have an MLID is routed to the broadcast MLID - then you do it
until you get an MLID, with periodic retries/refreshes of the SA
operation.
This is similar to how ethernet works, and is generally
harmless. Better to have a working, but suboptimal network, than one
that is busted.
Jason
^ permalink raw reply [flat|nested] 17+ messages in thread
end of thread, other threads:[~2010-10-12 20:47 UTC | newest]
Thread overview: 17+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2010-10-05 16:07 [PATCH] Make multicast and path record queue flexible Aleksey Senin
[not found] ` <4CAB4D49.9090107-smomgflXvOZWk0Htik3J/w@public.gmane.org>
2010-10-05 16:28 ` Jason Gunthorpe
[not found] ` <20101005162833.GC5967-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
2010-10-05 19:12 ` Christoph Lameter
[not found] ` <alpine.DEB.2.00.1010051410370.7065-sBS69tsa9Uj/9pzu0YdTqQ@public.gmane.org>
2010-10-05 19:54 ` Jason Gunthorpe
[not found] ` <20101005195428.GB24268-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
2010-10-05 20:02 ` Christoph Lameter
[not found] ` <alpine.DEB.2.00.1010051500330.8786-sBS69tsa9Uj/9pzu0YdTqQ@public.gmane.org>
2010-10-05 20:21 ` Jason Gunthorpe
[not found] ` <20101005202121.GC24268-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
2010-10-05 20:43 ` Christoph Lameter
[not found] ` <alpine.DEB.2.00.1010051535480.10603-sBS69tsa9Uj/9pzu0YdTqQ@public.gmane.org>
2010-10-05 20:55 ` Jason Gunthorpe
[not found] ` <20101005205545.GF24268-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
2010-10-05 21:12 ` Christoph Lameter
[not found] ` <alpine.DEB.2.00.1010051600190.10603-sBS69tsa9Uj/9pzu0YdTqQ@public.gmane.org>
2010-10-05 21:28 ` Jason Gunthorpe
[not found] ` <20101005212826.GG24268-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
2010-10-05 21:34 ` Christoph Lameter
2010-10-12 16:29 ` Alekseys Senin
[not found] ` <1286900993.31931.4.camel-uOVkuFIEnOODI2cvxHXf6UEOCMrvLtNR@public.gmane.org>
2010-10-12 20:47 ` Jason Gunthorpe
2010-10-05 19:18 ` Christoph Lameter
[not found] ` <alpine.DEB.2.00.1010051415100.7065-sBS69tsa9Uj/9pzu0YdTqQ@public.gmane.org>
2010-10-06 15:57 ` Alekseys Senin
[not found] ` <1286380676.31487.1.camel-uOVkuFIEnOODI2cvxHXf6UEOCMrvLtNR@public.gmane.org>
2010-10-06 16:16 ` Christoph Lameter
[not found] ` <alpine.DEB.2.00.1010061114590.31538-sBS69tsa9Uj/9pzu0YdTqQ@public.gmane.org>
2010-10-06 17:34 ` Jason Gunthorpe
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox