* [PATCH v2] opensm: Multicast root switch calculation
From: Slava Strebkov @ 2009-12-03 14:11 UTC
To: linux-rdma-u79uwXL29TY76Z2rM5mHXA
Propose a new algorithm for calculating the root switch of the multicast
spanning tree. Only edge switches (those connected to hosts) and switches
that are multicast members themselves are involved in the root
calculation. This is an improvement, especially on large fabrics, since
the number of switches is usually much smaller than the number of ports
sharing the same mcast group.
Signed-off-by: Slava Strebkov <slavas-smomgflXvOZWk0Htik3J/w@public.gmane.org>
---
opensm/include/opensm/osm_switch.h | 12 ++++
opensm/opensm/osm_mcast_mgr.c | 129 ++++++++++++++++++++++++++++++++---
2 files changed, 130 insertions(+), 11 deletions(-)
diff --git a/opensm/include/opensm/osm_switch.h b/opensm/include/opensm/osm_switch.h
index 205896d..51c4251 100644
--- a/opensm/include/opensm/osm_switch.h
+++ b/opensm/include/opensm/osm_switch.h
@@ -109,6 +109,9 @@ typedef struct osm_switch {
unsigned endport_links;
unsigned need_update;
void *priv;
+ cl_map_item_t mcast_item;
+ uint32_t num_of_mcm;
+ uint8_t is_mc_member;
} osm_switch_t;
/*
* FIELDS
@@ -151,6 +154,15 @@ typedef struct osm_switch {
* When set indicates that switch was probably reset, so
* fwd tables and rest cached data should be flushed
*
+* mcast_item
+* map item for switch in building mcast tree
+*
+* num_of_mcm
+* number of mcast members (ports) connected to switch
+*
+* is_mc_member
+* whether switch is a mcast member itself
+*
* SEE ALSO
* Switch object
*********/
diff --git a/opensm/opensm/osm_mcast_mgr.c b/opensm/opensm/osm_mcast_mgr.c
index 616584f..f40709f 100644
--- a/opensm/opensm/osm_mcast_mgr.c
+++ b/opensm/opensm/osm_mcast_mgr.c
@@ -224,6 +224,117 @@ static float osm_mcast_mgr_compute_max_hops(osm_sm_t * sm, cl_qlist_t * l,
return (float)max_hops;
}
+static void mcast_mgr_build_switch_map(osm_sm_t * sm,
+ const cl_qlist_t * port_list,
+ cl_qmap_t * p_mcast_member_sw_tbl)
+{
+ osm_switch_t *remote_sw;
+ cl_list_item_t *list_item;
+ osm_port_t *p_port;
+ ib_net64_t port_guid;
+ osm_physp_t *p_physp_remote;
+ osm_node_t *remote_node;
+ osm_mcast_work_obj_t *wobj;
+
+ OSM_LOG_ENTER(sm->p_log);
+
+ cl_qmap_init(p_mcast_member_sw_tbl);
+ for (list_item = cl_qlist_head(port_list);
+ list_item != cl_qlist_end(port_list);
+ list_item = cl_qlist_next(list_item)) {
+ wobj = cl_item_obj(list_item, wobj, list_item);
+ p_port = wobj->p_port;
+ if (!p_port)
+ continue;
+ if (p_port->p_node->sw) {
+ /* for switches - remote switch would be the switch itself */
+ remote_node = osm_physp_get_node_ptr(p_port->p_physp);
+ } else {
+ p_physp_remote = osm_physp_get_remote(p_port->p_physp);
+ remote_node = osm_physp_get_node_ptr(p_physp_remote);
+ }
+ /* get the remote switch of the mcmember */
+ remote_sw = remote_node->sw;
+ port_guid = osm_node_get_node_guid(remote_node);
+ if (cl_qmap_get(p_mcast_member_sw_tbl, port_guid) ==
+ cl_qmap_end(p_mcast_member_sw_tbl)) {
+ /* insert switch to table */
+ cl_qmap_insert(p_mcast_member_sw_tbl, port_guid, &remote_sw->mcast_item);
+ /* New element in the table */
+ if (osm_node_get_type(p_port->p_node) == IB_NODE_TYPE_CA)
+ /* for HCA update the MC count on the remote switch */
+ remote_sw->num_of_mcm++;
+ else
+ /* the switch is an MC member */
+ remote_sw->is_mc_member = 1;
+ }
+ }
+ OSM_LOG_EXIT(sm->p_log);
+}
+
+static void mcast_mgr_destroy_switch_map(osm_sm_t * sm,
+ cl_qmap_t *p_mcast_member_sw_tbl)
+{
+ cl_map_item_t *p_item;
+ osm_switch_t *p_sw;
+
+ OSM_LOG_ENTER(sm->p_log);
+
+ p_item = cl_qmap_head(p_mcast_member_sw_tbl);
+ while (p_item != cl_qmap_end(p_mcast_member_sw_tbl)) {
+ p_sw = PARENT_STRUCT(p_item, osm_switch_t, mcast_item);
+ p_sw->num_of_mcm = 0;
+ p_sw->is_mc_member = 0;
+ p_item = cl_qmap_next(p_item);
+ }
+ cl_qmap_remove_all(p_mcast_member_sw_tbl);
+ OSM_LOG_EXIT(sm->p_log);
+}
+
+static float
+osm_mcast_mgr_compute_avg_hops_weight(osm_sm_t * sm,
+ const osm_switch_t * const p_sw_cur,
+ const cl_qmap_t * p_sw_tbl)
+{
+ float avg_hops_weight = 0;
+ uint32_t hops = 0;
+ uint32_t num_ports = 0;
+ uint16_t base_lid_ho;
+ uint32_t tmp_hops;
+ uint32_t least_hops;
+ osm_switch_t *p_sw;
+ cl_map_item_t *p_item;
+
+ OSM_LOG_ENTER(sm->p_log);
+ /*
+ For each member of the multicast group, compute the
+ number of hops to its base LID.
+ */
+ for (p_item = cl_qmap_head(p_sw_tbl);
+ p_item != cl_qmap_end(p_sw_tbl); p_item = cl_qmap_next(p_item)) {
+ p_sw =
+ (osm_switch_t *) PARENT_STRUCT(p_item, osm_switch_t,
+ mcast_item);
+ base_lid_ho = cl_ntoh16(osm_node_get_base_lid(p_sw->p_node, 0));
+ least_hops = osm_switch_get_least_hops(p_sw_cur, base_lid_ho);
+ /* for all hosts that are MC members and attached to the switch,
+ we should add (least_hops + 1) * number_of_such_hosts.
+ If the switch itself is an MC member, add least_hops only */
+ tmp_hops =
+ (least_hops + 1) * p_sw->num_of_mcm +
+ least_hops * p_sw->is_mc_member;
+ hops += tmp_hops;
+ num_ports += p_sw->num_of_mcm + p_sw->is_mc_member;
+ }
+
+ CL_ASSERT(num_ports);
+ if (num_ports != 0)
+ avg_hops_weight = (float)hops / num_ports;
+
+ OSM_LOG_EXIT(sm->p_log);
+ return (avg_hops_weight);
+}
+
/**********************************************************************
This function attempts to locate the optimal switch for the
center of the spanning tree. The current algorithm chooses
@@ -231,32 +342,27 @@ static float osm_mcast_mgr_compute_max_hops(osm_sm_t * sm, cl_qlist_t * l,
of the multicast group.
**********************************************************************/
static osm_switch_t *mcast_mgr_find_optimal_switch(osm_sm_t * sm,
- cl_qlist_t *list)
+ cl_qlist_t * list)
{
cl_qmap_t *p_sw_tbl;
osm_switch_t *p_sw, *p_best_sw = NULL;
float hops = 0;
float best_hops = 10000; /* any big # will do */
-#ifdef OSM_VENDOR_INTF_ANAFA
- boolean_t use_avg_hops = TRUE; /* anafa2 - bug hca on switch *//* use max hops for root */
-#else
- boolean_t use_avg_hops = FALSE; /* use max hops for root */
-#endif
-
+ cl_qmap_t mcast_member_sw_tbl;
OSM_LOG_ENTER(sm->p_log);
p_sw_tbl = &sm->p_subn->sw_guid_tbl;
+ mcast_mgr_build_switch_map(sm, list, &mcast_member_sw_tbl);
for (p_sw = (osm_switch_t *) cl_qmap_head(p_sw_tbl);
p_sw != (osm_switch_t *) cl_qmap_end(p_sw_tbl);
p_sw = (osm_switch_t *) cl_qmap_next(&p_sw->map_item)) {
if (!osm_switch_supports_mcast(p_sw))
continue;
- if (use_avg_hops)
- hops = osm_mcast_mgr_compute_avg_hops(sm, list, p_sw);
- else
- hops = osm_mcast_mgr_compute_max_hops(sm, list, p_sw);
+ hops =
+ osm_mcast_mgr_compute_avg_hops_weight(sm, p_sw,
+ &mcast_member_sw_tbl);
OSM_LOG(sm->p_log, OSM_LOG_DEBUG,
"Switch 0x%016" PRIx64 ", hops = %f\n",
@@ -277,6 +383,7 @@ static osm_switch_t *mcast_mgr_find_optimal_switch(osm_sm_t * sm,
OSM_LOG(sm->p_log, OSM_LOG_VERBOSE,
"No multicast capable switches detected\n");
+ mcast_mgr_destroy_switch_map(sm, &mcast_member_sw_tbl);
OSM_LOG_EXIT(sm->p_log);
return p_best_sw;
}
--
1.6.3.3
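
As a worked example of the new weight: the sketch below is a minimal
standalone program, with made-up topology numbers and a hypothetical
sw_entry struct standing in for the fields the patch adds to osm_switch_t.
Each member host behind a switch is one hop beyond the switch itself,
hence the (least_hops + 1) term; a switch that is itself a member
contributes least_hops, and the sum is averaged over all contributing
ports.

#include <stdio.h>
#include <stdint.h>

/* Hypothetical per-switch summary mirroring the fields the patch adds
 * to osm_switch_t; the topology values in main() are made up. */
struct sw_entry {
	uint32_t least_hops;   /* min hops from candidate root to this switch */
	uint32_t num_of_mcm;   /* member hosts hanging off this switch */
	uint8_t  is_mc_member; /* 1 if the switch itself joined the group */
};

static float avg_hops_weight(const struct sw_entry *sw, int n)
{
	uint32_t hops = 0, num_ports = 0;
	int i;

	for (i = 0; i < n; i++) {
		/* a host behind the switch is one hop beyond the switch,
		 * hence least_hops + 1 per such host */
		hops += (sw[i].least_hops + 1) * sw[i].num_of_mcm +
			sw[i].least_hops * sw[i].is_mc_member;
		num_ports += sw[i].num_of_mcm + sw[i].is_mc_member;
	}
	return num_ports ? (float)hops / num_ports : 0.0f;
}

int main(void)
{
	/* three edge switches: 4 member hosts at 2 hops, 2 hosts at
	 * 3 hops, and one switch that is itself a member at 1 hop */
	struct sw_entry sw[] = { {2, 4, 0}, {3, 2, 0}, {1, 0, 1} };

	/* (3*4 + 4*2 + 1*1) / (4 + 2 + 1) = 21 / 7 = 3.0 */
	printf("avg hops weight = %.2f\n", avg_hops_weight(sw, 3));
	return 0;
}

Any C99 compiler builds this; for the topology above it prints 3.00.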
* Re: [PATCH v2] opensm: Multicast root switch calculation
From: Sasha Khapyorsky @ 2010-01-20 10:27 UTC
To: Slava Strebkov
Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA, Eli Dorfman, Or Gerlitz, Yevgeny Kliteynik

Hi Slava,

On 16:11 Thu 03 Dec, Slava Strebkov wrote:
> Propose a new algorithm for calculating the root switch of the multicast
> spanning tree. Only edge switches (those connected to hosts) and switches
> that are multicast members themselves are involved in the root
> calculation. This is an improvement, especially on large fabrics, since
> the number of switches is usually much smaller than the number of ports
> sharing the same mcast group.
>
> Signed-off-by: Slava Strebkov <slavas-smomgflXvOZWk0Htik3J/w@public.gmane.org>
> ---

[snip]

> @@ -231,32 +342,27 @@ static float osm_mcast_mgr_compute_max_hops(osm_sm_t * sm, cl_qlist_t * l,

[snip]

> - if (use_avg_hops)
> - hops = osm_mcast_mgr_compute_avg_hops(sm, list, p_sw);
> - else
> - hops = osm_mcast_mgr_compute_max_hops(sm, list, p_sw);
> + hops =
> + osm_mcast_mgr_compute_avg_hops_weight(sm, p_sw,
> + &mcast_member_sw_tbl);

Any reason why the root switch computation method was changed? By default
it was "max hops" based, and as far as I can see you changed this to
"average hops".

Sasha

[remainder of quoted patch snipped]
* RE: [PATCH v2] opensm: Multicast root switch calculation
From: Slava Strebkov @ 2010-01-20 11:32 UTC
To: Sasha Khapyorsky
Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA, Eli Dorfman, Or Gerlitz, Yevgeny Kliteynik

Hi Sasha,

"Average hops" was chosen instead of "max hops" because in the root weight
calculation the number of ports is also important, not only the distance
(hops).

Slava

[quoted original message snipped]
* Re: [PATCH v2] opensm: Multicast root switch calculation
From: Sasha Khapyorsky @ 2010-01-20 11:59 UTC
To: Slava Strebkov
Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA, Eli Dorfman, Or Gerlitz, Yevgeny Kliteynik

On 13:32 Wed 20 Jan, Slava Strebkov wrote:
> "Average hops" was chosen instead of "max hops" because in the root
> weight calculation the number of ports is also important, not only the
> distance (hops).

But this patch is declared as a root switch calculation optimization, not
as an algorithm change (actually I even missed this part in v1). If you
think the algorithm should be changed, we need to handle that separately.
And there could be a discussion - I have a report that some time ago a
credit loop issue was observed when the "average hops" method was used
(and it was then replaced by "max hops"). I don't have many details about
this case and cannot be sure that it was not a "false alarm" (or outdated
information), but in any case I think we should be careful here.

Sasha

[quoted message snipped]
* Re: [PATCH v2] opensm: Multicast root switch calculation
From: Sasha Khapyorsky @ 2010-01-27 10:45 UTC
To: Slava Strebkov
Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA, Eli Dorfman, Or Gerlitz, Yevgeny Kliteynik

On 13:59 Wed 20 Jan, Sasha Khapyorsky wrote:
> On 13:32 Wed 20 Jan, Slava Strebkov wrote:
> > "Average hops" was chosen instead of "max hops" because in the root
> > weight calculation the number of ports is also important, not only the
> > distance (hops).
>
> But this patch is declared as a root switch calculation optimization,
> not as an algorithm change (actually I even missed this part in v1).

I reworked this patch preserving the original ("max hops") calculation
method. Please look at this.

The next step is to evaluate the "max hops" -> "average hops" switch and
to clean up the OSM_VENDOR_INTF_ANAFA macro.

Sasha


From: Slava Strebkov <slavas-hKgKHo2Ms0F+cjeuK/JdrQ@public.gmane.org>
Date: Thu, 3 Dec 2009 16:11:30 +0200
Subject: [PATCH] opensm: Multicast root switch calculation

Propose a new algorithm for calculating the root switch of the multicast
spanning tree. Only edge switches (those connected to hosts) and switches
that are multicast members themselves are involved in the root
calculation. This is an improvement, especially on large fabrics, since
the number of switches is usually much smaller than the number of ports
sharing the same mcast group.

Signed-off-by: Slava Strebkov <slavas-smomgflXvOZWk0Htik3J/w@public.gmane.org>
Signed-off-by: Sasha Khapyorsky <sashak-smomgflXvOZWk0Htik3J/w@public.gmane.org>
---
opensm/include/opensm/osm_switch.h | 12 +++
opensm/opensm/osm_mcast_mgr.c | 149 ++++++++++++++++++++++++++---------
2 files changed, 122 insertions(+), 39 deletions(-)

diff --git a/opensm/include/opensm/osm_switch.h b/opensm/include/opensm/osm_switch.h
index 205896d..cb6e5ac 100644
--- a/opensm/include/opensm/osm_switch.h
+++ b/opensm/include/opensm/osm_switch.h
@@ -109,6 +109,9 @@ typedef struct osm_switch {
unsigned endport_links;
unsigned need_update;
void *priv;
+ cl_map_item_t mgrp_item;
+ uint32_t num_of_mcm;
+ uint8_t is_mc_member;
} osm_switch_t;
/*
* FIELDS
@@ -151,6 +154,15 @@ typedef struct osm_switch {
* When set indicates that switch was probably reset, so
* fwd tables and rest cached data should be flushed
*
+* mgrp_item
+* map item for switch in building mcast tree
+*
+* num_of_mcm
+* number of mcast members (ports) connected to switch
+*
+* is_mc_member
+* whether switch is a mcast member itself
+*
* SEE ALSO
* Switch object
*********/
diff --git a/opensm/opensm/osm_mcast_mgr.c b/opensm/opensm/osm_mcast_mgr.c
index dce9f2b..5c9d0bc 100644
--- a/opensm/opensm/osm_mcast_mgr.c
+++ b/opensm/opensm/osm_mcast_mgr.c
@@ -157,50 +157,119 @@ static void mcast_mgr_purge_tree(osm_sm_t * sm, IN osm_mgrp_box_t * mbox)
OSM_LOG_EXIT(sm->p_log);
}

-static float osm_mcast_mgr_compute_avg_hops(osm_sm_t * sm, cl_qlist_t * l,
- const osm_switch_t * p_sw)
+static void mcast_mgr_build_switch_map(osm_sm_t * sm,
+ const cl_qlist_t * port_list,
+ cl_qmap_t * p_mcast_member_sw_tbl)
{
- float avg_hops = 0;
- uint32_t hops = 0;
- uint32_t num_ports = 0;
- cl_list_item_t *i;
+ osm_switch_t *remote_sw;
+ cl_list_item_t *list_item;
+ osm_port_t *p_port;
+ ib_net64_t port_guid;
+ osm_physp_t *p_physp_remote;
+ osm_node_t *remote_node;
osm_mcast_work_obj_t *wobj;

OSM_LOG_ENTER(sm->p_log);

- /*
- For each member of the multicast group, compute the
- number of hops to its base LID.
- */
- for (i = cl_qlist_head(l); i != cl_qlist_end(l); i = cl_qlist_next(i)) {
- wobj = cl_item_obj(i, wobj, list_item);
- hops += osm_switch_get_port_least_hops(p_sw, wobj->p_port);
- num_ports++;
+ cl_qmap_init(p_mcast_member_sw_tbl);
+ for (list_item = cl_qlist_head(port_list);
+ list_item != cl_qlist_end(port_list);
+ list_item = cl_qlist_next(list_item)) {
+ wobj = cl_item_obj(list_item, wobj, list_item);
+ p_port = wobj->p_port;
+ if (!p_port)
+ continue;
+ if (p_port->p_node->sw) {
+ /* for switches - remote switch would be the switch itself */
+ remote_node = osm_physp_get_node_ptr(p_port->p_physp);
+ } else {
+ p_physp_remote = osm_physp_get_remote(p_port->p_physp);
+ remote_node = osm_physp_get_node_ptr(p_physp_remote);
+ }
+ /* get the remote switch of the mcmember */
+ remote_sw = remote_node->sw;
+ port_guid = osm_node_get_node_guid(remote_node);
+ if (cl_qmap_get(p_mcast_member_sw_tbl, port_guid) ==
+ cl_qmap_end(p_mcast_member_sw_tbl)) {
+ /* insert switch to table */
+ cl_qmap_insert(p_mcast_member_sw_tbl, port_guid, &remote_sw->mgrp_item);
+ /* New element in the table */
+ if (osm_node_get_type(p_port->p_node) == IB_NODE_TYPE_CA)
+ /* for HCA update the MC count on the remote switch */
+ remote_sw->num_of_mcm++;
+ else
+ /* the switch is an MC member */
+ remote_sw->is_mc_member = 1;
+ }
}
+ OSM_LOG_EXIT(sm->p_log);
+}

- /*
- We shouldn't be here if there aren't any ports in the group.
- */
- CL_ASSERT(num_ports);
+static void mcast_mgr_destroy_switch_map(osm_sm_t * sm,
+ cl_qmap_t *p_mcast_member_sw_tbl)
+{
+ cl_map_item_t *p_item;
+ osm_switch_t *p_sw;

- if (num_ports != 0)
- avg_hops = (float)hops / num_ports;
+ OSM_LOG_ENTER(sm->p_log);

+ p_item = cl_qmap_head(p_mcast_member_sw_tbl);
+ while (p_item != cl_qmap_end(p_mcast_member_sw_tbl)) {
+ p_sw = PARENT_STRUCT(p_item, osm_switch_t, mgrp_item);
+ p_sw->num_of_mcm = 0;
+ p_sw->is_mc_member = 0;
+ p_item = cl_qmap_next(p_item);
+ }
+ cl_qmap_remove_all(p_mcast_member_sw_tbl);
OSM_LOG_EXIT(sm->p_log);
- return avg_hops;
}

/**********************************************************************
Calculate the maximal "min hops" from the given switch to any
of the group HCAs
**********************************************************************/
-static float osm_mcast_mgr_compute_max_hops(osm_sm_t * sm, cl_qlist_t * l,
- const osm_switch_t * p_sw)
+#ifdef OSM_VENDOR_INTF_ANAFA
+static float osm_mcast_mgr_compute_avg_hops(osm_sm_t * sm, cl_qmap_t * m,
+ const osm_switch_t * this_sw)
{
- uint32_t max_hops = 0;
+ float avg_hops = 0;
uint32_t hops = 0;
- cl_list_item_t *i;
- osm_mcast_work_obj_t *wobj;
+ uint32_t num_ports = 0;
+ uint16_t lid;
+ uint32_t least_hops;
+ cl_map_item_t *i;
+ osm_switch_t *sw;
+
+ OSM_LOG_ENTER(sm->p_log);
+
+ for (i = cl_qmap_head(m); i != cl_qmap_end(m); i = cl_qmap_next(i)) {
+ sw = cl_item_obj(i, sw, mgrp_item);
+ lid = cl_ntoh16(osm_node_get_base_lid(sw->p_node, 0));
+ least_hops = osm_switch_get_least_hops(this_sw, lid);
+ /* for all hosts that are MC members and attached to the switch,
+ we should add (least_hops + 1) * number_of_such_hosts.
+ If the switch itself is an MC member, add least_hops only */
+ hops += (least_hops + 1) * sw->num_of_mcm +
+ least_hops * sw->is_mc_member;
+ num_ports += sw->num_of_mcm + sw->is_mc_member;
+ }
+
+ /* We shouldn't be here if there aren't any ports in the group. */
+ CL_ASSERT(num_ports);
+
+ avg_hops = (float)hops / num_ports;
+
+ OSM_LOG_EXIT(sm->p_log);
+ return avg_hops;
+}
+#else
+static float osm_mcast_mgr_compute_max_hops(osm_sm_t * sm, cl_qmap_t * m,
+ const osm_switch_t * this_sw)
+{
+ uint32_t max_hops = 0, hops;
+ uint16_t lid;
+ cl_map_item_t *i;
+ osm_switch_t *sw;

OSM_LOG_ENTER(sm->p_log);

@@ -208,9 +277,11 @@ static float osm_mcast_mgr_compute_max_hops(osm_sm_t * sm, cl_qlist_t * l,
For each member of the multicast group, compute the
number of hops to its base LID.
*/
- for (i = cl_qlist_head(l); i != cl_qlist_end(l); i = cl_qlist_next(i)) {
- wobj = cl_item_obj(i, wobj, list_item);
- hops = osm_switch_get_port_least_hops(p_sw, wobj->p_port);
+ for (i = cl_qmap_head(m); i != cl_qmap_end(m); i = cl_qmap_next(i)) {
+ sw = cl_item_obj(i, sw, mgrp_item);
+ lid = cl_ntoh16(osm_node_get_base_lid(sw->p_node, 0));
+ hops = osm_switch_get_least_hops(this_sw, lid);
+ hops = (hops + 1) * sw->num_of_mcm + hops * sw->is_mc_member;
if (hops > max_hops)
max_hops = hops;
}
@@ -222,6 +293,7 @@ static float osm_mcast_mgr_compute_max_hops(osm_sm_t * sm, cl_qlist_t * l,
OSM_LOG_EXIT(sm->p_log);
return (float)max_hops;
}
+#endif

/**********************************************************************
This function attempts to locate the optimal switch for the
@@ -230,32 +302,30 @@ static float osm_mcast_mgr_compute_max_hops(osm_sm_t * sm, cl_qlist_t * l,
of the multicast group.
**********************************************************************/
static osm_switch_t *mcast_mgr_find_optimal_switch(osm_sm_t * sm,
- cl_qlist_t *list)
+ cl_qlist_t * list)
{
+ cl_qmap_t mgrp_sw_map;
cl_qmap_t *p_sw_tbl;
osm_switch_t *p_sw, *p_best_sw = NULL;
float hops = 0;
float best_hops = 10000; /* any big # will do */
-#ifdef OSM_VENDOR_INTF_ANAFA
- boolean_t use_avg_hops = TRUE; /* anafa2 - bug hca on switch *//* use max hops for root */
-#else
- boolean_t use_avg_hops = FALSE; /* use max hops for root */
-#endif

OSM_LOG_ENTER(sm->p_log);

p_sw_tbl = &sm->p_subn->sw_guid_tbl;

+ mcast_mgr_build_switch_map(sm, list, &mgrp_sw_map);
for (p_sw = (osm_switch_t *) cl_qmap_head(p_sw_tbl);
p_sw != (osm_switch_t *) cl_qmap_end(p_sw_tbl);
p_sw = (osm_switch_t *) cl_qmap_next(&p_sw->map_item)) {
if (!osm_switch_supports_mcast(p_sw))
continue;

- if (use_avg_hops)
- hops = osm_mcast_mgr_compute_avg_hops(sm, list, p_sw);
- else
- hops = osm_mcast_mgr_compute_max_hops(sm, list, p_sw);
+#ifdef OSM_VENDOR_INTF_ANAFA
+ hops = osm_mcast_mgr_compute_avg_hops(sm, &mgrp_sw_map, p_sw);
+#else
+ hops = osm_mcast_mgr_compute_max_hops(sm, &mgrp_sw_map, p_sw);
+#endif

OSM_LOG(sm->p_log, OSM_LOG_DEBUG,
"Switch 0x%016" PRIx64 ", hops = %f\n",
@@ -276,6 +346,7 @@ static osm_switch_t *mcast_mgr_find_optimal_switch(osm_sm_t * sm,
OSM_LOG(sm->p_log, OSM_LOG_VERBOSE,
"No multicast capable switches detected\n");

+ mcast_mgr_destroy_switch_map(sm, &mgrp_sw_map);
OSM_LOG_EXIT(sm->p_log);
return p_best_sw;
}
--
1.6.6.1
* Re: [PATCH v2] opensm: Multicast root switch calculation
From: Hal Rosenstock @ 2010-01-28 15:39 UTC
To: Sasha Khapyorsky
Cc: Slava Strebkov, linux-rdma-u79uwXL29TY76Z2rM5mHXA, Eli Dorfman, Or Gerlitz, Yevgeny Kliteynik

On Wed, Jan 27, 2010 at 5:45 AM, Sasha Khapyorsky wrote:
> I reworked this patch preserving the original ("max hops") calculation
> method. Please look at this.
>
> The next step is to evaluate the "max hops" -> "average hops" switch and
> to clean up the OSM_VENDOR_INTF_ANAFA macro.
>
> Propose a new algorithm for calculating the root switch of the multicast
> spanning tree. Only edge switches (those connected to hosts)

What about switches whose peer port is a router? Shouldn't they be
included here?

> and switches
> that are multicast members themselves are involved in the root
> calculation.

[snip]

> + /* New element in the table */
> + if (osm_node_get_type(p_port->p_node) == IB_NODE_TYPE_CA)
> + /* for HCA update the MC count on the remote switch */
> + remote_sw->num_of_mcm++;

Should this be != IB_NODE_TYPE_SWITCH so that both CAs and routers are
included here?

-- Hal

[remainder of quoted patch snipped]
* Re: [PATCH v2] opensm: Multicast root switch calculation
From: Sasha Khapyorsky @ 2010-02-03 10:34 UTC
To: Hal Rosenstock
Cc: Slava Strebkov, linux-rdma-u79uwXL29TY76Z2rM5mHXA, Eli Dorfman, Or Gerlitz, Yevgeny Kliteynik

On 10:39 Thu 28 Jan, Hal Rosenstock wrote:
> > Propose a new algorithm for calculating the root switch of the
> > multicast spanning tree. Only edge switches (those connected to hosts)
>
> What about switches whose peer port is a router? Shouldn't they be
> included here?

Yes.

> > + if (osm_node_get_type(p_port->p_node) == IB_NODE_TYPE_CA)
> > + /* for HCA update the MC count on the remote switch */
> > + remote_sw->num_of_mcm++;
>
> Should this be != IB_NODE_TYPE_SWITCH so that both CAs and routers are
> included here?

Yes, obviously.

Sasha
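
For reference, the agreed fix amounts to inverting the node-type test so
that routers are counted like CAs. A sketch of the relevant branch (this
form is equivalent to the p_port->p_node->sw test that v4 below ends up
using):

	/* count any non-switch end port (CA or router) toward num_of_mcm */
	if (osm_node_get_type(p_port->p_node) != IB_NODE_TYPE_SWITCH)
		/* for CAs and routers update the MC count on the remote switch */
		remote_sw->num_of_mcm++;
	else
		/* the switch is an MC member itself */
		remote_sw->is_mc_member = 1;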
* [PATCH v4] opensm: Multicast root switch calculation
From: Sasha Khapyorsky @ 2010-02-03 10:39 UTC
To: Slava Strebkov
Cc: Hal Rosenstock, linux-rdma-u79uwXL29TY76Z2rM5mHXA, Eli Dorfman, Or Gerlitz, Yevgeny Kliteynik

From: Slava Strebkov <slavas-hKgKHo2Ms0F+cjeuK/JdrQ@public.gmane.org>

Propose a new algorithm for calculating the root switch of the multicast
spanning tree. Only edge switches (those connected to hosts or routers)
and switches that are multicast members themselves are involved in the
root calculation. This is an improvement, especially on large fabrics,
since the number of switches is usually much smaller than the number of
ports sharing the same mcast group.

Signed-off-by: Slava Strebkov <slavas-smomgflXvOZWk0Htik3J/w@public.gmane.org>
Signed-off-by: Sasha Khapyorsky <sashak-smomgflXvOZWk0Htik3J/w@public.gmane.org>
---
opensm/include/opensm/osm_switch.h | 12 +++
opensm/opensm/osm_mcast_mgr.c | 149 ++++++++++++++++++++++++++---------
2 files changed, 122 insertions(+), 39 deletions(-)

diff --git a/opensm/include/opensm/osm_switch.h b/opensm/include/opensm/osm_switch.h
index 205896d..cb6e5ac 100644
--- a/opensm/include/opensm/osm_switch.h
+++ b/opensm/include/opensm/osm_switch.h
@@ -109,6 +109,9 @@ typedef struct osm_switch {
unsigned endport_links;
unsigned need_update;
void *priv;
+ cl_map_item_t mgrp_item;
+ uint32_t num_of_mcm;
+ uint8_t is_mc_member;
} osm_switch_t;
/*
* FIELDS
@@ -151,6 +154,15 @@ typedef struct osm_switch {
* When set indicates that switch was probably reset, so
* fwd tables and rest cached data should be flushed
*
+* mgrp_item
+* map item for switch in building mcast tree
+*
+* num_of_mcm
+* number of mcast members (ports) connected to switch
+*
+* is_mc_member
+* whether switch is a mcast member itself
+*
* SEE ALSO
* Switch object
*********/
diff --git a/opensm/opensm/osm_mcast_mgr.c b/opensm/opensm/osm_mcast_mgr.c
index dce9f2b..b50f360 100644
--- a/opensm/opensm/osm_mcast_mgr.c
+++ b/opensm/opensm/osm_mcast_mgr.c
@@ -157,50 +157,119 @@ static void mcast_mgr_purge_tree(osm_sm_t * sm, IN osm_mgrp_box_t * mbox)
OSM_LOG_EXIT(sm->p_log);
}

-static float osm_mcast_mgr_compute_avg_hops(osm_sm_t * sm, cl_qlist_t * l,
- const osm_switch_t * p_sw)
+static void mcast_mgr_build_switch_map(osm_sm_t * sm,
+ const cl_qlist_t * port_list,
+ cl_qmap_t * p_mcast_member_sw_tbl)
{
- float avg_hops = 0;
- uint32_t hops = 0;
- uint32_t num_ports = 0;
- cl_list_item_t *i;
+ osm_switch_t *remote_sw;
+ cl_list_item_t *list_item;
+ osm_port_t *p_port;
+ ib_net64_t port_guid;
+ osm_physp_t *p_physp_remote;
+ osm_node_t *remote_node;
osm_mcast_work_obj_t *wobj;

OSM_LOG_ENTER(sm->p_log);

- /*
- For each member of the multicast group, compute the
- number of hops to its base LID.
- */
- for (i = cl_qlist_head(l); i != cl_qlist_end(l); i = cl_qlist_next(i)) {
- wobj = cl_item_obj(i, wobj, list_item);
- hops += osm_switch_get_port_least_hops(p_sw, wobj->p_port);
- num_ports++;
+ cl_qmap_init(p_mcast_member_sw_tbl);
+ for (list_item = cl_qlist_head(port_list);
+ list_item != cl_qlist_end(port_list);
+ list_item = cl_qlist_next(list_item)) {
+ wobj = cl_item_obj(list_item, wobj, list_item);
+ p_port = wobj->p_port;
+ if (!p_port)
+ continue;
+ if (p_port->p_node->sw) {
+ /* for switches - remote switch would be the switch itself */
+ remote_node = osm_physp_get_node_ptr(p_port->p_physp);
+ } else {
+ p_physp_remote = osm_physp_get_remote(p_port->p_physp);
+ remote_node = osm_physp_get_node_ptr(p_physp_remote);
+ }
+ /* get the remote switch of the mcmember */
+ remote_sw = remote_node->sw;
+ port_guid = osm_node_get_node_guid(remote_node);
+ if (cl_qmap_get(p_mcast_member_sw_tbl, port_guid) ==
+ cl_qmap_end(p_mcast_member_sw_tbl)) {
+ /* insert switch to table */
+ cl_qmap_insert(p_mcast_member_sw_tbl, port_guid, &remote_sw->mgrp_item);
+ /* New element in the table */
+ if (p_port->p_node->sw)
+ /* the switch is an MC member */
+ remote_sw->is_mc_member = 1;
+ else
+ /* for others - update MC count */
+ remote_sw->num_of_mcm++;
+ }
}
+ OSM_LOG_EXIT(sm->p_log);
+}

- /*
- We shouldn't be here if there aren't any ports in the group.
- */
- CL_ASSERT(num_ports);
+static void mcast_mgr_destroy_switch_map(osm_sm_t * sm,
+ cl_qmap_t *p_mcast_member_sw_tbl)
+{
+ cl_map_item_t *p_item;
+ osm_switch_t *p_sw;

- if (num_ports != 0)
- avg_hops = (float)hops / num_ports;
+ OSM_LOG_ENTER(sm->p_log);

+ p_item = cl_qmap_head(p_mcast_member_sw_tbl);
+ while (p_item != cl_qmap_end(p_mcast_member_sw_tbl)) {
+ p_sw = PARENT_STRUCT(p_item, osm_switch_t, mgrp_item);
+ p_sw->num_of_mcm = 0;
+ p_sw->is_mc_member = 0;
+ p_item = cl_qmap_next(p_item);
+ }
+ cl_qmap_remove_all(p_mcast_member_sw_tbl);
OSM_LOG_EXIT(sm->p_log);
- return avg_hops;
}

/**********************************************************************
Calculate the maximal "min hops" from the given switch to any
of the group HCAs
**********************************************************************/
-static float osm_mcast_mgr_compute_max_hops(osm_sm_t * sm, cl_qlist_t * l,
- const osm_switch_t * p_sw)
+#ifdef OSM_VENDOR_INTF_ANAFA
+static float osm_mcast_mgr_compute_avg_hops(osm_sm_t * sm, cl_qmap_t * m,
+ const osm_switch_t * this_sw)
{
- uint32_t max_hops = 0;
+ float avg_hops = 0;
uint32_t hops = 0;
- cl_list_item_t *i;
- osm_mcast_work_obj_t *wobj;
+ uint32_t num_ports = 0;
+ uint16_t lid;
+ uint32_t least_hops;
+ cl_map_item_t *i;
+ osm_switch_t *sw;
+
+ OSM_LOG_ENTER(sm->p_log);
+
+ for (i = cl_qmap_head(m); i != cl_qmap_end(m); i = cl_qmap_next(i)) {
+ sw = cl_item_obj(i, sw, mgrp_item);
+ lid = cl_ntoh16(osm_node_get_base_lid(sw->p_node, 0));
+ least_hops = osm_switch_get_least_hops(this_sw, lid);
+ /* for all hosts that are MC members and attached to the switch,
+ we should add (least_hops + 1) * number_of_such_hosts.
+ If the switch itself is an MC member, add least_hops only */
+ hops += (least_hops + 1) * sw->num_of_mcm +
+ least_hops * sw->is_mc_member;
+ num_ports += sw->num_of_mcm + sw->is_mc_member;
+ }
+
+ /* We shouldn't be here if there aren't any ports in the group. */
+ CL_ASSERT(num_ports);
+
+ avg_hops = (float)hops / num_ports;
+
+ OSM_LOG_EXIT(sm->p_log);
+ return avg_hops;
+}
+#else
+static float osm_mcast_mgr_compute_max_hops(osm_sm_t * sm, cl_qmap_t * m,
+ const osm_switch_t * this_sw)
+{
+ uint32_t max_hops = 0, hops;
+ uint16_t lid;
+ cl_map_item_t *i;
+ osm_switch_t *sw;

OSM_LOG_ENTER(sm->p_log);

@@ -208,9 +277,11 @@ static float osm_mcast_mgr_compute_max_hops(osm_sm_t * sm, cl_qlist_t * l,
For each member of the multicast group, compute the
number of hops to its base LID.
*/
- for (i = cl_qlist_head(l); i != cl_qlist_end(l); i = cl_qlist_next(i)) {
- wobj = cl_item_obj(i, wobj, list_item);
- hops = osm_switch_get_port_least_hops(p_sw, wobj->p_port);
+ for (i = cl_qmap_head(m); i != cl_qmap_end(m); i = cl_qmap_next(i)) {
+ sw = cl_item_obj(i, sw, mgrp_item);
+ lid = cl_ntoh16(osm_node_get_base_lid(sw->p_node, 0));
+ hops = osm_switch_get_least_hops(this_sw, lid);
+ hops = (hops + 1) * sw->num_of_mcm + hops * sw->is_mc_member;
if (hops > max_hops)
max_hops = hops;
}
@@ -222,6 +293,7 @@ static float osm_mcast_mgr_compute_max_hops(osm_sm_t * sm, cl_qlist_t * l,
OSM_LOG_EXIT(sm->p_log);
return (float)max_hops;
}
+#endif

/**********************************************************************
This function attempts to locate the optimal switch for the
@@ -230,32 +302,30 @@ static float osm_mcast_mgr_compute_max_hops(osm_sm_t * sm, cl_qlist_t * l,
of the multicast group.
**********************************************************************/
static osm_switch_t *mcast_mgr_find_optimal_switch(osm_sm_t * sm,
- cl_qlist_t *list)
+ cl_qlist_t * list)
{
+ cl_qmap_t mgrp_sw_map;
cl_qmap_t *p_sw_tbl;
osm_switch_t *p_sw, *p_best_sw = NULL;
float hops = 0;
float best_hops = 10000; /* any big # will do */
-#ifdef OSM_VENDOR_INTF_ANAFA
- boolean_t use_avg_hops = TRUE; /* anafa2 - bug hca on switch *//* use max hops for root */
-#else
- boolean_t use_avg_hops = FALSE; /* use max hops for root */
-#endif

OSM_LOG_ENTER(sm->p_log);

p_sw_tbl = &sm->p_subn->sw_guid_tbl;

+ mcast_mgr_build_switch_map(sm, list, &mgrp_sw_map);
for (p_sw = (osm_switch_t *) cl_qmap_head(p_sw_tbl);
p_sw != (osm_switch_t *) cl_qmap_end(p_sw_tbl);
p_sw = (osm_switch_t *) cl_qmap_next(&p_sw->map_item)) {
if (!osm_switch_supports_mcast(p_sw))
continue;

- if (use_avg_hops)
- hops = osm_mcast_mgr_compute_avg_hops(sm, list, p_sw);
- else
- hops = osm_mcast_mgr_compute_max_hops(sm, list, p_sw);
+#ifdef OSM_VENDOR_INTF_ANAFA
+ hops = osm_mcast_mgr_compute_avg_hops(sm, &mgrp_sw_map, p_sw);
+#else
+ hops = osm_mcast_mgr_compute_max_hops(sm, &mgrp_sw_map, p_sw);
+#endif

OSM_LOG(sm->p_log, OSM_LOG_DEBUG,
"Switch 0x%016" PRIx64 ", hops = %f\n",
@@ -276,6 +346,7 @@ static osm_switch_t *mcast_mgr_find_optimal_switch(osm_sm_t * sm,
OSM_LOG(sm->p_log, OSM_LOG_VERBOSE,
"No multicast capable switches detected\n");

+ mcast_mgr_destroy_switch_map(sm, &mgrp_sw_map);
OSM_LOG_EXIT(sm->p_log);
return p_best_sw;
}
--
1.6.6.1
* Re: [PATCH v4] opensm: Multicast root switch calculation
  2010-02-03 10:39           ` [PATCH v4] " Sasha Khapyorsky
@ 2010-02-04 14:22             ` Hal Rosenstock
  [not found]                   ` <f0e08f231002040622l32c2fc9blc41318828f01cf35-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
  0 siblings, 1 reply; 10+ messages in thread
From: Hal Rosenstock @ 2010-02-04 14:22 UTC (permalink / raw)
  To: Sasha Khapyorsky
  Cc: Slava Strebkov, linux-rdma-u79uwXL29TY76Z2rM5mHXA, Eli Dorfman,
	Or Gerlitz, Yevgeny Kliteynik

On Wed, Feb 3, 2010 at 5:39 AM, Sasha Khapyorsky <sashak@voltaire.com> wrote:
>
> From: Slava Strebkov <slavas@Voltaire.COM>
>
> Proposed new algorithm for calculating the root switch of the multicast
> spanning tree. Only edge switches (those connected to hosts or routers)
> and switches that are multicast members themselves are involved in the
> root calculation. This gives an improvement, especially on large
> fabrics, since the number of switches is usually much smaller than the
> number of ports sharing the same mcast group.
>
> Signed-off-by: Slava Strebkov <slavas@voltaire.com>
> Signed-off-by: Sasha Khapyorsky <sashak@voltaire.com>
> ---
>  opensm/include/opensm/osm_switch.h |   12 +++
>  opensm/opensm/osm_mcast_mgr.c      |  149 ++++++++++++++++++++++++++---------
>  2 files changed, 122 insertions(+), 39 deletions(-)
>
> diff --git a/opensm/include/opensm/osm_switch.h b/opensm/include/opensm/osm_switch.h
> index 205896d..cb6e5ac 100644
> --- a/opensm/include/opensm/osm_switch.h
> +++ b/opensm/include/opensm/osm_switch.h
> @@ -109,6 +109,9 @@ typedef struct osm_switch {
>  	unsigned endport_links;
>  	unsigned need_update;
>  	void *priv;
> +	cl_map_item_t mgrp_item;
> +	uint32_t num_of_mcm;
> +	uint8_t is_mc_member;
>  } osm_switch_t;
>  /*
>  * FIELDS
> @@ -151,6 +154,15 @@ typedef struct osm_switch {
>  *	When set indicates that switch was probably reset, so
>  *	fwd tables and rest cached data should be flushed
>  *
> +* mgrp_item
> +*	map item for switch in building mcast tree
> +*
> +* num_of_mcm
> +*	number of mcast members (ports) connected to switch
> +*
> +* is_mc_member
> +*	whether switch is a mcast member itself
> +*
>  * SEE ALSO
>  *	Switch object
>  *********/
> diff --git a/opensm/opensm/osm_mcast_mgr.c b/opensm/opensm/osm_mcast_mgr.c
> index dce9f2b..b50f360 100644
> --- a/opensm/opensm/osm_mcast_mgr.c
> +++ b/opensm/opensm/osm_mcast_mgr.c
> @@ -157,50 +157,119 @@ static void mcast_mgr_purge_tree(osm_sm_t * sm, IN osm_mgrp_box_t * mbox)
>  	OSM_LOG_EXIT(sm->p_log);
>  }
>
> -static float osm_mcast_mgr_compute_avg_hops(osm_sm_t * sm, cl_qlist_t * l,
> -					    const osm_switch_t * p_sw)
> +static void mcast_mgr_build_switch_map(osm_sm_t * sm,
> +				       const cl_qlist_t * port_list,
> +				       cl_qmap_t * p_mcast_member_sw_tbl)
>  {
> -	float avg_hops = 0;
> -	uint32_t hops = 0;
> -	uint32_t num_ports = 0;
> -	cl_list_item_t *i;
> +	osm_switch_t *remote_sw;
> +	cl_list_item_t *list_item;
> +	osm_port_t *p_port;
> +	ib_net64_t port_guid;
> +	osm_physp_t *p_physp_remote;
> +	osm_node_t *remote_node;
>  	osm_mcast_work_obj_t *wobj;
>
>  	OSM_LOG_ENTER(sm->p_log);
>
> -	/*
> -	   For each member of the multicast group, compute the
> -	   number of hops to its base LID.
> -	 */
> -	for (i = cl_qlist_head(l); i != cl_qlist_end(l); i = cl_qlist_next(i)) {
> -		wobj = cl_item_obj(i, wobj, list_item);
> -		hops += osm_switch_get_port_least_hops(p_sw, wobj->p_port);
> -		num_ports++;
> +	cl_qmap_init(p_mcast_member_sw_tbl);
> +	for (list_item = cl_qlist_head(port_list);
> +	     list_item != cl_qlist_end(port_list);
> +	     list_item = cl_qlist_next(list_item)) {
> +		wobj = cl_item_obj(list_item, wobj, list_item);
> +		p_port = wobj->p_port;
> +		if (!p_port)
> +			continue;
> +		if (p_port->p_node->sw) {
> +			/* for switches - remote switch would be the switch itself */
> +			remote_node = osm_physp_get_node_ptr(p_port->p_physp);
> +		} else {
> +			p_physp_remote = osm_physp_get_remote(p_port->p_physp);
> +			remote_node = osm_physp_get_node_ptr(p_physp_remote);
> +		}
> +		/* get the remote switch of the mcmember */
> +		remote_sw = remote_node->sw;
> +		port_guid = osm_node_get_node_guid(remote_node);
> +		if (cl_qmap_get(p_mcast_member_sw_tbl, port_guid) ==
> +		    cl_qmap_end(p_mcast_member_sw_tbl)) {
> +			/* insert switch to table */
> +			cl_qmap_insert(p_mcast_member_sw_tbl, port_guid, &remote_sw->mgrp_item);
> +			/* New element in the table */
> +			if (p_port->p_node->sw)
> +				/* the switch is an MC member */
> +				remote_sw->is_mc_member = 1;
> +			else
> +				/* for others - update MC count */
> +				remote_sw->num_of_mcm++;
> +		}
>  	}
> +	OSM_LOG_EXIT(sm->p_log);
> +}
>
> -	/*
> -	   We should be here if there aren't any ports in the group.
> -	 */
> -	CL_ASSERT(num_ports);

What about well known groups with no current members? Do those groups
not get here, or do they have a port?

-- Hal

> +static void mcast_mgr_destroy_switch_map(osm_sm_t * sm,
> +					 cl_qmap_t *p_mcast_member_sw_tbl)
> +{
> +	cl_map_item_t *p_item;
> +	osm_switch_t *p_sw;
>
> -	if (num_ports != 0)
> -		avg_hops = (float)(hops / num_ports);
> +	OSM_LOG_ENTER(sm->p_log);
>
> +	p_item = cl_qmap_head(p_mcast_member_sw_tbl);
> +	while (p_item != cl_qmap_end(p_mcast_member_sw_tbl)) {
> +		p_sw = PARENT_STRUCT(p_item, osm_switch_t, mgrp_item);
> +		p_sw->num_of_mcm = 0;
> +		p_sw->is_mc_member = 0;
> +		p_item = cl_qmap_next(p_item);
> +	}
> +	cl_qmap_remove_all(p_mcast_member_sw_tbl);
>  	OSM_LOG_EXIT(sm->p_log);
> -	return avg_hops;
>  }
>
>  /**********************************************************************
>     Calculate the maximal "min hops" from the given switch to any
>     of the group HCAs
>  **********************************************************************/
> -static float osm_mcast_mgr_compute_max_hops(osm_sm_t * sm, cl_qlist_t * l,
> -					    const osm_switch_t * p_sw)
> +#ifdef OSM_VENDOR_INTF_ANAFA
> +static float osm_mcast_mgr_compute_avg_hops(osm_sm_t * sm, cl_qmap_t * m,
> +					    const osm_switch_t * this_sw)
>  {
> -	uint32_t max_hops = 0;
> +	float avg_hops = 0;
>  	uint32_t hops = 0;
> -	cl_list_item_t *i;
> -	osm_mcast_work_obj_t *wobj;
> +	uint32_t num_ports = 0;
> +	uint16_t lid;
> +	uint32_t least_hops;
> +	cl_map_item_t *i;
> +	osm_switch_t *sw;
> +
> +	OSM_LOG_ENTER(sm->p_log);
> +
> +	for (i = cl_qmap_head(m); i != cl_qmap_end(m); i = cl_qmap_next(i)) {
> +		sw = cl_item_obj(i, sw, mgrp_item);
> +		lid = cl_ntoh16(osm_node_get_base_lid(sw->p_node, 0));
> +		least_hops = osm_switch_get_least_hops(this_sw, lid);
> +		/* for all hosts that are MC members and attached to the switch,
> +		   we should add (least_hops + 1) * number_of_such_hosts.
> +		   If the switch itself is in the MC, we should add least_hops only */
> +		hops += (least_hops + 1) * sw->num_of_mcm +
> +			least_hops * sw->is_mc_member;
> +		num_ports += sw->num_of_mcm + sw->is_mc_member;
> +	}
> +
> +	/* We should not be here if there aren't any ports in the group. */
> +	CL_ASSERT(num_ports);
> +
> +	avg_hops = (float)(hops / num_ports);
> +
> +	OSM_LOG_EXIT(sm->p_log);
> +	return avg_hops;
> +}
> +#else
> +static float osm_mcast_mgr_compute_max_hops(osm_sm_t * sm, cl_qmap_t * m,
> +					    const osm_switch_t * this_sw)
> +{
> +	uint32_t max_hops = 0, hops;
> +	uint16_t lid;
> +	cl_map_item_t *i;
> +	osm_switch_t *sw;
>
>  	OSM_LOG_ENTER(sm->p_log);
>
> @@ -208,9 +277,11 @@ static float osm_mcast_mgr_compute_max_hops(osm_sm_t * sm, cl_qlist_t * l,
>  	   For each member of the multicast group, compute the
>  	   number of hops to its base LID.
>  	 */
> -	for (i = cl_qlist_head(l); i != cl_qlist_end(l); i = cl_qlist_next(i)) {
> -		wobj = cl_item_obj(i, wobj, list_item);
> -		hops = osm_switch_get_port_least_hops(p_sw, wobj->p_port);
> +	for (i = cl_qmap_head(m); i != cl_qmap_end(m); i = cl_qmap_next(i)) {
> +		sw = cl_item_obj(i, sw, mgrp_item);
> +		lid = cl_ntoh16(osm_node_get_base_lid(sw->p_node, 0));
> +		hops = osm_switch_get_least_hops(this_sw, lid);
> +		hops = (hops + 1) * sw->num_of_mcm + hops * sw->is_mc_member;
>  		if (hops > max_hops)
>  			max_hops = hops;
>  	}
> @@ -222,6 +293,7 @@ static float osm_mcast_mgr_compute_max_hops(osm_sm_t * sm, cl_qlist_t * l,
>  	OSM_LOG_EXIT(sm->p_log);
>  	return (float)max_hops;
>  }
> +#endif
>
>  /**********************************************************************
>     This function attempts to locate the optimal switch for the
> @@ -230,32 +302,30 @@ static float osm_mcast_mgr_compute_max_hops(osm_sm_t * sm, cl_qlist_t * l,
>     of the multicast group.
>  **********************************************************************/
>  static osm_switch_t *mcast_mgr_find_optimal_switch(osm_sm_t * sm,
> -						   cl_qlist_t *list)
> +						   cl_qlist_t * list)
>  {
> +	cl_qmap_t mgrp_sw_map;
>  	cl_qmap_t *p_sw_tbl;
>  	osm_switch_t *p_sw, *p_best_sw = NULL;
>  	float hops = 0;
>  	float best_hops = 10000;	/* any big # will do */
> -#ifdef OSM_VENDOR_INTF_ANAFA
> -	boolean_t use_avg_hops = TRUE;	/* anafa2 - bug hca on switch *//* use max hops for root */
> -#else
> -	boolean_t use_avg_hops = FALSE;	/* use max hops for root */
> -#endif
>
>  	OSM_LOG_ENTER(sm->p_log);
>
>  	p_sw_tbl = &sm->p_subn->sw_guid_tbl;
>
> +	mcast_mgr_build_switch_map(sm, list, &mgrp_sw_map);
>  	for (p_sw = (osm_switch_t *) cl_qmap_head(p_sw_tbl);
>  	     p_sw != (osm_switch_t *) cl_qmap_end(p_sw_tbl);
>  	     p_sw = (osm_switch_t *) cl_qmap_next(&p_sw->map_item)) {
>  		if (!osm_switch_supports_mcast(p_sw))
>  			continue;
>
> -		if (use_avg_hops)
> -			hops = osm_mcast_mgr_compute_avg_hops(sm, list, p_sw);
> -		else
> -			hops = osm_mcast_mgr_compute_max_hops(sm, list, p_sw);
> +#ifdef OSM_VENDOR_INTF_ANAFA
> +		hops = osm_mcast_mgr_compute_avg_hops(sm, &mgrp_sw_map, p_sw);
> +#else
> +		hops = osm_mcast_mgr_compute_max_hops(sm, &mgrp_sw_map, p_sw);
> +#endif
>
>  		OSM_LOG(sm->p_log, OSM_LOG_DEBUG,
>  			"Switch 0x%016" PRIx64 ", hops = %f\n",
> @@ -276,6 +346,7 @@ static osm_switch_t *mcast_mgr_find_optimal_switch(osm_sm_t * sm,
>  	OSM_LOG(sm->p_log, OSM_LOG_VERBOSE,
>  		"No multicast capable switches detected\n");
>
> +	mcast_mgr_destroy_switch_map(sm, &mgrp_sw_map);
>  	OSM_LOG_EXIT(sm->p_log);
>  	return p_best_sw;
>  }
> --
> 1.6.6.1
>

^ permalink raw reply	[flat|nested] 10+ messages in thread
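The improvement claimed in the changelog comes from the fold itself: the
P member ports collapse into the usually far smaller set of M unique
edge switches, so the candidate scan costs O(S * M) instead of O(S * P)
across the S switches of the fabric. Below is a rough sketch of that
fold, with hypothetical flat types standing in for the GUID-keyed
cl_qmap; for simplicity it tallies every member HCA behind a switch,
whereas the patch hunk above bumps the counter only when the switch
first enters the table:

#include <stddef.h>
#include <stdint.h>

struct edge_sw {
	uint64_t guid;		/* node GUID of the edge switch */
	uint32_t num_of_mcm;	/* member HCAs behind it */
	uint8_t is_mc_member;	/* the switch joined the group itself */
};

struct member_port {
	uint64_t sw_guid;	/* GUID of the adjacent (or own) switch */
	int on_switch;		/* nonzero if the member is a switch port */
};

/* Fold member ports into unique edge switches; returns how many unique
 * switches were recorded in 'out' (at most 'cap'). */
static size_t build_switch_map(const struct member_port *ports, size_t np,
			       struct edge_sw *out, size_t cap)
{
	size_t n = 0, i, j;

	for (i = 0; i < np; i++) {
		for (j = 0; j < n; j++)	/* the real code uses a qmap, not a scan */
			if (out[j].guid == ports[i].sw_guid)
				break;
		if (j == n) {
			if (n == cap)
				break;	/* table full; cannot happen with a qmap */
			out[n].guid = ports[i].sw_guid;
			out[n].num_of_mcm = 0;
			out[n].is_mc_member = 0;
			n++;
		}
		if (ports[i].on_switch)
			out[j].is_mc_member = 1;
		else
			out[j].num_of_mcm++;
	}
	return n;
}

In the patch the table is keyed by node GUID via the cl_map_item_t
embedded in osm_switch_t; mcast_mgr_build_switch_map() fills it once per
group and mcast_mgr_destroy_switch_map() zeroes the per-switch counters
again after the root has been chosen.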
[parent not found: <f0e08f231002040622l32c2fc9blc41318828f01cf35-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>]
* Re: [PATCH v4] opensm: Multicast root switch calculation
  [not found]                   ` <f0e08f231002040622l32c2fc9blc41318828f01cf35-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
@ 2010-02-04 18:52                     ` Sasha Khapyorsky
  0 siblings, 0 replies; 10+ messages in thread
From: Sasha Khapyorsky @ 2010-02-04 18:52 UTC (permalink / raw)
  To: Hal Rosenstock
  Cc: Slava Strebkov, linux-rdma-u79uwXL29TY76Z2rM5mHXA, Eli Dorfman,
	Or Gerlitz, Yevgeny Kliteynik

On 09:22 Thu 04 Feb, Hal Rosenstock wrote:
>
> What about well known groups with no current members? Do those groups
> not get here, or do they have a port?

Empty MC groups don't participate in routing.

Sasha
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 10+ messages in thread
end of thread, other threads:[~2010-02-04 18:52 UTC | newest]
Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2009-12-03 14:11 [PATCH v2] opensm: Multicast root switch calculation Slava Strebkov
[not found] ` <4B17C712.9010109-hKgKHo2Ms0F+cjeuK/JdrQ@public.gmane.org>
2010-01-20 10:27 ` Sasha Khapyorsky
2010-01-20 11:32 ` Slava Strebkov
[not found] ` <39C75744D164D948A170E9792AF8E7CA01F6FA8A-QfUkFaTmzUSUvQqKE/ONIwC/G2K4zDHf@public.gmane.org>
2010-01-20 11:59 ` Sasha Khapyorsky
2010-01-27 10:45 ` Sasha Khapyorsky
2010-01-28 15:39 ` Hal Rosenstock
[not found] ` <f0e08f231001280739m31dae78cj5ab7bd78621dcbfc-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2010-02-03 10:34 ` Sasha Khapyorsky
2010-02-03 10:39 ` [PATCH v4] " Sasha Khapyorsky
2010-02-04 14:22 ` Hal Rosenstock
[not found] ` <f0e08f231002040622l32c2fc9blc41318828f01cf35-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2010-02-04 18:52 ` Sasha Khapyorsky