public inbox for linux-rdma@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH V4] opensm: Move Error printing to MAD error call back functions.
@ 2011-12-16 17:49 Ira Weiny
       [not found] ` <20111216094934.0f7bd2ee.weiny2-i2BcT+NCU+M@public.gmane.org>
  0 siblings, 1 reply; 4+ messages in thread
From: Ira Weiny @ 2011-12-16 17:49 UTC (permalink / raw)
  To: Alex Netes, Hal Rosenstock
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org


Only print the transaction ID of timed-out MADs on VERBOSE.

Signed-off-by: Ira Weiny <weiny2-i2BcT+NCU+M@public.gmane.org>
---
 libvendor/osm_vendor_ibumad.c |   27 +++------------------------
 opensm/osm_helper.c           |    5 +++--
 opensm/osm_perfmgr.c          |    4 +++-
 opensm/osm_sa_mad_ctrl.c      |   12 +++++++++++-
 opensm/osm_sm_mad_ctrl.c      |   16 ++++++++++++++--
 5 files changed, 34 insertions(+), 30 deletions(-)

diff --git a/libvendor/osm_vendor_ibumad.c b/libvendor/osm_vendor_ibumad.c
index e2ebd8e..00069f5 100644
--- a/libvendor/osm_vendor_ibumad.c
+++ b/libvendor/osm_vendor_ibumad.c
@@ -327,30 +327,9 @@ static void *umad_receiver(void *p_ptr)
 		/* if status != 0 then we are handling recv timeout on send */
 		if (umad_status(p_madw->vend_wrap.umad)) {
 
-			if (mad->mgmt_class != IB_MCLASS_SUBN_DIR) {
-				/* LID routed */
-				OSM_LOG(p_vend->p_log, OSM_LOG_ERROR, "ERR 5410: "
-					"Send completed with error -- dropping\n"
-					"\t\t\tClass 0x%x, Method 0x%X, Attr 0x%X, "
-					"TID 0x%" PRIx64 ", LID %u\n",
-					mad->mgmt_class, mad->method,
-					cl_ntoh16(mad->attr_id),
-					cl_ntoh64(mad->trans_id),
-					cl_ntoh16(ib_mad_addr->lid));
-			} else {
-				ib_smp_t *smp;
-
-				/* Direct routed SMP */
-				smp = (ib_smp_t *) mad;
-				OSM_LOG(p_vend->p_log, OSM_LOG_ERROR, "ERR 5411: "
-					"DR SMP Send completed with error -- dropping\n"
-					"\t\t\tMethod 0x%X, Attr 0x%X, TID 0x%" PRIx64
-					", Hop Ptr: 0x%X\n",
-					mad->method, cl_ntoh16(mad->attr_id),
-					cl_ntoh64(mad->trans_id), smp->hop_ptr);
-				osm_dump_smp_dr_path(p_vend->p_log, smp,
-						     OSM_LOG_ERROR);
-			}
+			OSM_LOG(p_vend->p_log, OSM_LOG_VERBOSE, "ERR 5410: "
+				"Receive Timeout on Send -- dropping "
+				"TID 0x%" PRIx64 "\n", cl_ntoh64(mad->trans_id));
 
 			if (!(p_req_madw = get_madw(p_vend, &mad->trans_id))) {
 				OSM_LOG(p_vend->p_log, OSM_LOG_ERROR,
diff --git a/opensm/osm_helper.c b/opensm/osm_helper.c
index f9f3d9d..b968679 100644
--- a/opensm/osm_helper.c
+++ b/opensm/osm_helper.c
@@ -2059,8 +2059,9 @@ void osm_dump_smp_dr_path(IN osm_log_t * p_log, IN const ib_smp_t * p_smp,
 		char buf[BUF_SIZE];
 		unsigned n;
 
-		n = sprintf(buf, "Received SMP on a %u hop path: "
-			    "Initial path = ", p_smp->hop_count);
+		n = sprintf(buf, "   DR SMP (TID 0x%" PRIx64 ") on a %u hop path: "
+			    "Initial path = ",
+			    cl_ntoh64(p_smp->trans_id), p_smp->hop_count);
 		n += sprint_uint8_arr(buf + n, sizeof(buf) - n,
 				      p_smp->initial_path,
 				      p_smp->hop_count + 1);
diff --git a/opensm/osm_perfmgr.c b/opensm/osm_perfmgr.c
index ded5a5e..fc3f74b 100644
--- a/opensm/osm_perfmgr.c
+++ b/opensm/osm_perfmgr.c
@@ -212,7 +212,9 @@ static void perfmgr_mad_send_err_callback(void *bind_context,
 	p_mon_node = (monitored_node_t *) p_node;
 
 	OSM_LOG(pm->log, OSM_LOG_ERROR, "ERR 4C02: %s (0x%" PRIx64
-		") port %u\n", p_mon_node->name, p_mon_node->guid, port);
+		") port %u; DLID %u, TID 0x%" PRIx64 "\n", p_mon_node->name,
+		p_mon_node->guid, port, cl_ntoh16(p_madw->mad_addr.dest_lid),
+		cl_ntoh64(p_madw->p_mad->trans_id));
 
 	if (pm->subn->opt.perfmgr_redir && p_madw->status == IB_TIMEOUT) {
 		/* First, find the node in the monitored map */
diff --git a/opensm/osm_sa_mad_ctrl.c b/opensm/osm_sa_mad_ctrl.c
index bde88fa..4caead1 100644
--- a/opensm/osm_sa_mad_ctrl.c
+++ b/opensm/osm_sa_mad_ctrl.c
@@ -413,8 +413,18 @@ static void sa_mad_ctrl_send_err_callback(IN void *context,
 	   Retire the original request MAD.
 	 */
 
+	OSM_LOG(p_ctrl->p_log, OSM_LOG_ERROR, "ERR 1A0A: "
+		"SA MAD completed in error (%s): "
+		"%s(%s), attr_mod 0x%x, TID 0x%" PRIx64 ", DLID %u\n",
+		ib_get_err_str(p_madw->status),
+		ib_get_sa_method_str(p_madw->p_mad->method),
+		ib_get_sa_attr_str(p_madw->p_mad->attr_id),
+		cl_ntoh32(p_madw->p_mad->attr_mod),
+		cl_ntoh64(p_madw->p_mad->trans_id),
+		cl_ntoh16(p_madw->mad_addr.dest_lid));
+
 	osm_dump_sa_mad(p_ctrl->p_log, osm_madw_get_sa_mad_ptr(p_madw),
-			OSM_LOG_ERROR);
+			OSM_LOG_VERBOSE);
 
 	/*  sm_mad_ctrl_update_wire_stats( p_ctrl ); */
 
diff --git a/opensm/osm_sm_mad_ctrl.c b/opensm/osm_sm_mad_ctrl.c
index ee92c66..a3b444a 100644
--- a/opensm/osm_sm_mad_ctrl.c
+++ b/opensm/osm_sm_mad_ctrl.c
@@ -704,6 +704,7 @@ Exit:
  */
 static void sm_mad_ctrl_send_err_cb(IN void *context, IN osm_madw_t * p_madw)
 {
+	char lidstr[8];
 	osm_sm_mad_ctrl_t *p_ctrl = context;
 	ib_api_status_t status;
 	ib_smp_t *p_smp;
@@ -713,13 +714,24 @@ static void sm_mad_ctrl_send_err_cb(IN void *context, IN osm_madw_t * p_madw)
 	CL_ASSERT(p_madw);
 
 	p_smp = osm_madw_get_smp_ptr(p_madw);
+
+	if (p_smp->mgmt_class == IB_MCLASS_SUBN_DIR)
+		lidstr[0] = '\0';
+	else
+		snprintf(lidstr, 8, " DLID %u",
+			cl_ntoh16(p_madw->mad_addr.dest_lid));
+
 	OSM_LOG(p_ctrl->p_log, OSM_LOG_ERROR, "ERR 3113: "
 		"MAD completed in error (%s): "
-		"%s(%s), attr_mod 0x%x, TID 0x%" PRIx64 "\n",
+		"%s(%s), attr_mod 0x%x, TID 0x%" PRIx64 " %s\n",
 		ib_get_err_str(p_madw->status),
 		ib_get_sm_method_str(p_smp->method),
 		ib_get_sm_attr_str(p_smp->attr_id), cl_ntoh32(p_smp->attr_mod),
-		cl_ntoh64(p_smp->trans_id));
+		cl_ntoh64(p_smp->trans_id),
+		lidstr);
+
+	if (p_smp->mgmt_class == IB_MCLASS_SUBN_DIR)
+		osm_dump_smp_dr_path(p_ctrl->p_log, p_smp, OSM_LOG_ERROR);
 
 	/*
 	   If this was a SubnSet MAD, then this error might indicate a problem
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related	[flat|nested] 4+ messages in thread

* Re: [PATCH V4] opensm: Move Error printing to MAD error call back functions.
       [not found] ` <20111216094934.0f7bd2ee.weiny2-i2BcT+NCU+M@public.gmane.org>
@ 2011-12-16 19:27   ` Hal Rosenstock
       [not found]     ` <4EEB9BAC.5040104-LDSdmyG8hGV8YrgS2mwiifqBs+8SCbDb@public.gmane.org>
  0 siblings, 1 reply; 4+ messages in thread
From: Hal Rosenstock @ 2011-12-16 19:27 UTC (permalink / raw)
  To: Ira Weiny; +Cc: Alex Netes, linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org

On 12/16/2011 12:49 PM, Ira Weiny wrote:
> 
> Only print the transaction ID of timed out MAD's on VERBOSE.
> 
> Signed-off-by: Ira Weiny <weiny2-i2BcT+NCU+M@public.gmane.org>
> ---
>  libvendor/osm_vendor_ibumad.c |   27 +++------------------------
>  opensm/osm_helper.c           |    5 +++--
>  opensm/osm_perfmgr.c          |    4 +++-
>  opensm/osm_sa_mad_ctrl.c      |   12 +++++++++++-
>  opensm/osm_sm_mad_ctrl.c      |   16 ++++++++++++++--
>  5 files changed, 34 insertions(+), 30 deletions(-)
> 
> diff --git a/libvendor/osm_vendor_ibumad.c b/libvendor/osm_vendor_ibumad.c
> index e2ebd8e..00069f5 100644
> --- a/libvendor/osm_vendor_ibumad.c
> +++ b/libvendor/osm_vendor_ibumad.c
> @@ -327,30 +327,9 @@ static void *umad_receiver(void *p_ptr)
>  		/* if status != 0 then we are handling recv timeout on send */
>  		if (umad_status(p_madw->vend_wrap.umad)) {
>  
> -			if (mad->mgmt_class != IB_MCLASS_SUBN_DIR) {
> -				/* LID routed */
> -				OSM_LOG(p_vend->p_log, OSM_LOG_ERROR, "ERR 5410: "
> -					"Send completed with error -- dropping\n"
> -					"\t\t\tClass 0x%x, Method 0x%X, Attr 0x%X, "
> -					"TID 0x%" PRIx64 ", LID %u\n",
> -					mad->mgmt_class, mad->method,
> -					cl_ntoh16(mad->attr_id),
> -					cl_ntoh64(mad->trans_id),
> -					cl_ntoh16(ib_mad_addr->lid));
> -			} else {
> -				ib_smp_t *smp;
> -
> -				/* Direct routed SMP */
> -				smp = (ib_smp_t *) mad;
> -				OSM_LOG(p_vend->p_log, OSM_LOG_ERROR, "ERR 5411: "
> -					"DR SMP Send completed with error -- dropping\n"
> -					"\t\t\tMethod 0x%X, Attr 0x%X, TID 0x%" PRIx64
> -					", Hop Ptr: 0x%X\n",
> -					mad->method, cl_ntoh16(mad->attr_id),
> -					cl_ntoh64(mad->trans_id), smp->hop_ptr);

One thing I just noticed in cobbling up the other approach for
comparison purposes is that the logging of the hop pointer was removed.
Should that be preserved ?

-- Hal

> -				osm_dump_smp_dr_path(p_vend->p_log, smp,
> -						     OSM_LOG_ERROR);
> -			}
> +			OSM_LOG(p_vend->p_log, OSM_LOG_VERBOSE, "ERR 5410: "
> +				"Receive Timeout on Send -- dropping "
> +				"TID 0x%" PRIx64 "\n", cl_ntoh64(mad->trans_id));
>  
>  			if (!(p_req_madw = get_madw(p_vend, &mad->trans_id))) {
>  				OSM_LOG(p_vend->p_log, OSM_LOG_ERROR,
> diff --git a/opensm/osm_helper.c b/opensm/osm_helper.c
> index f9f3d9d..b968679 100644
> --- a/opensm/osm_helper.c
> +++ b/opensm/osm_helper.c
> @@ -2059,8 +2059,9 @@ void osm_dump_smp_dr_path(IN osm_log_t * p_log, IN const ib_smp_t * p_smp,
>  		char buf[BUF_SIZE];
>  		unsigned n;
>  
> -		n = sprintf(buf, "Received SMP on a %u hop path: "
> -			    "Initial path = ", p_smp->hop_count);
> +		n = sprintf(buf, "   DR SMP (TID 0x%" PRIx64 ") on a %u hop path: "
> +			    "Initial path = ",
> +			    cl_ntoh64(p_smp->trans_id), p_smp->hop_count);
>  		n += sprint_uint8_arr(buf + n, sizeof(buf) - n,
>  				      p_smp->initial_path,
>  				      p_smp->hop_count + 1);
> diff --git a/opensm/osm_perfmgr.c b/opensm/osm_perfmgr.c
> index ded5a5e..fc3f74b 100644
> --- a/opensm/osm_perfmgr.c
> +++ b/opensm/osm_perfmgr.c
> @@ -212,7 +212,9 @@ static void perfmgr_mad_send_err_callback(void *bind_context,
>  	p_mon_node = (monitored_node_t *) p_node;
>  
>  	OSM_LOG(pm->log, OSM_LOG_ERROR, "ERR 4C02: %s (0x%" PRIx64
> -		") port %u\n", p_mon_node->name, p_mon_node->guid, port);
> +		") port %u; DLID %u, TID 0x%" PRIx64 "\n", p_mon_node->name,
> +		p_mon_node->guid, port, cl_ntoh16(p_madw->mad_addr.dest_lid),
> +		cl_ntoh64(p_madw->p_mad->trans_id));
>  
>  	if (pm->subn->opt.perfmgr_redir && p_madw->status == IB_TIMEOUT) {
>  		/* First, find the node in the monitored map */
> diff --git a/opensm/osm_sa_mad_ctrl.c b/opensm/osm_sa_mad_ctrl.c
> index bde88fa..4caead1 100644
> --- a/opensm/osm_sa_mad_ctrl.c
> +++ b/opensm/osm_sa_mad_ctrl.c
> @@ -413,8 +413,18 @@ static void sa_mad_ctrl_send_err_callback(IN void *context,
>  	   Retire the original request MAD.
>  	 */
>  
> +	OSM_LOG(p_ctrl->p_log, OSM_LOG_ERROR, "ERR 1A0A: "
> +		"SA MAD completed in error (%s): "
> +		"%s(%s), attr_mod 0x%x, TID 0x%" PRIx64 ", DLID %u\n",
> +		ib_get_err_str(p_madw->status),
> +		ib_get_sa_method_str(p_madw->p_mad->method),
> +		ib_get_sa_attr_str(p_madw->p_mad->attr_id),
> +		cl_ntoh32(p_madw->p_mad->attr_mod),
> +		cl_ntoh64(p_madw->p_mad->trans_id),
> +		cl_ntoh16(p_madw->mad_addr.dest_lid));
> +
>  	osm_dump_sa_mad(p_ctrl->p_log, osm_madw_get_sa_mad_ptr(p_madw),
> -			OSM_LOG_ERROR);
> +			OSM_LOG_VERBOSE);
>  
>  	/*  sm_mad_ctrl_update_wire_stats( p_ctrl ); */
>  
> diff --git a/opensm/osm_sm_mad_ctrl.c b/opensm/osm_sm_mad_ctrl.c
> index ee92c66..a3b444a 100644
> --- a/opensm/osm_sm_mad_ctrl.c
> +++ b/opensm/osm_sm_mad_ctrl.c
> @@ -704,6 +704,7 @@ Exit:
>   */
>  static void sm_mad_ctrl_send_err_cb(IN void *context, IN osm_madw_t * p_madw)
>  {
> +	char lidstr[8];
>  	osm_sm_mad_ctrl_t *p_ctrl = context;
>  	ib_api_status_t status;
>  	ib_smp_t *p_smp;
> @@ -713,13 +714,24 @@ static void sm_mad_ctrl_send_err_cb(IN void *context, IN osm_madw_t * p_madw)
>  	CL_ASSERT(p_madw);
>  
>  	p_smp = osm_madw_get_smp_ptr(p_madw);
> +
> +	if (p_smp->mgmt_class == IB_MCLASS_SUBN_DIR)
> +		lidstr[0] = '\0';
> +	else
> +		snprintf(lidstr, 8, " DLID %u",
> +			cl_ntoh16(p_madw->mad_addr.dest_lid));
> +
>  	OSM_LOG(p_ctrl->p_log, OSM_LOG_ERROR, "ERR 3113: "
>  		"MAD completed in error (%s): "
> -		"%s(%s), attr_mod 0x%x, TID 0x%" PRIx64 "\n",
> +		"%s(%s), attr_mod 0x%x, TID 0x%" PRIx64 " %s\n",
>  		ib_get_err_str(p_madw->status),
>  		ib_get_sm_method_str(p_smp->method),
>  		ib_get_sm_attr_str(p_smp->attr_id), cl_ntoh32(p_smp->attr_mod),
> -		cl_ntoh64(p_smp->trans_id));
> +		cl_ntoh64(p_smp->trans_id),
> +		lidstr);
> +
> +	if (p_smp->mgmt_class == IB_MCLASS_SUBN_DIR)
> +		osm_dump_smp_dr_path(p_ctrl->p_log, p_smp, OSM_LOG_ERROR);
>  
>  	/*
>  	   If this was a SubnSet MAD, then this error might indicate a problem

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH V4] opensm: Move Error printing to MAD error call back functions.
       [not found]     ` <4EEB9BAC.5040104-LDSdmyG8hGV8YrgS2mwiifqBs+8SCbDb@public.gmane.org>
@ 2011-12-16 19:45       ` Ira Weiny
       [not found]         ` <20111216114557.27cef81e.weiny2-i2BcT+NCU+M@public.gmane.org>
  0 siblings, 1 reply; 4+ messages in thread
From: Ira Weiny @ 2011-12-16 19:45 UTC (permalink / raw)
  To: Hal Rosenstock
  Cc: Alex Netes, linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org

On Fri, 16 Dec 2011 11:27:40 -0800
Hal Rosenstock <hal-LDSdmyG8hGV8YrgS2mwiifqBs+8SCbDb@public.gmane.org> wrote:

> On 12/16/2011 12:49 PM, Ira Weiny wrote:
> > 
> > Only print the transaction ID of timed out MAD's on VERBOSE.
> > 
> > Signed-off-by: Ira Weiny <weiny2-i2BcT+NCU+M@public.gmane.org>
> > ---

[snip]

> > -			} else {
> > -				ib_smp_t *smp;
> > -
> > -				/* Direct routed SMP */
> > -				smp = (ib_smp_t *) mad;
> > -				OSM_LOG(p_vend->p_log, OSM_LOG_ERROR, "ERR 5411: "
> > -					"DR SMP Send completed with error -- dropping\n"
> > -					"\t\t\tMethod 0x%X, Attr 0x%X, TID 0x%" PRIx64
> > -					", Hop Ptr: 0x%X\n",
> > -					mad->method, cl_ntoh16(mad->attr_id),
> > -					cl_ntoh64(mad->trans_id), smp->hop_ptr);
> 
> One thing I just noticed in cobbling up the other approach for
> comparison purposes is that the logging of the hop pointer was removed.
> Should that be preserved ?

Since we are printing the request MAD wouldn't the hop pointer always be 0?

And while we are at it we should print the DLID/SLID since it could be combined routing.

Ira

> 
> -- Hal
> 
> > -				osm_dump_smp_dr_path(p_vend->p_log, smp,
> > -						     OSM_LOG_ERROR);
> > -			}
> > +			OSM_LOG(p_vend->p_log, OSM_LOG_VERBOSE, "ERR 5410: "
> > +				"Receive Timeout on Send -- dropping "
> > +				"TID 0x%" PRIx64 "\n", cl_ntoh64(mad->trans_id));
> >  
> >  			if (!(p_req_madw = get_madw(p_vend, &mad->trans_id))) {
> >  				OSM_LOG(p_vend->p_log, OSM_LOG_ERROR,
> > diff --git a/opensm/osm_helper.c b/opensm/osm_helper.c
> > index f9f3d9d..b968679 100644
> > --- a/opensm/osm_helper.c
> > +++ b/opensm/osm_helper.c
> > @@ -2059,8 +2059,9 @@ void osm_dump_smp_dr_path(IN osm_log_t * p_log, IN const ib_smp_t * p_smp,
> >  		char buf[BUF_SIZE];
> >  		unsigned n;
> >  
> > -		n = sprintf(buf, "Received SMP on a %u hop path: "
> > -			    "Initial path = ", p_smp->hop_count);
> > +		n = sprintf(buf, "   DR SMP (TID 0x%" PRIx64 ") on a %u hop path: "
> > +			    "Initial path = ",
> > +			    cl_ntoh64(p_smp->trans_id), p_smp->hop_count);
> >  		n += sprint_uint8_arr(buf + n, sizeof(buf) - n,
> >  				      p_smp->initial_path,
> >  				      p_smp->hop_count + 1);
> > diff --git a/opensm/osm_perfmgr.c b/opensm/osm_perfmgr.c
> > index ded5a5e..fc3f74b 100644
> > --- a/opensm/osm_perfmgr.c
> > +++ b/opensm/osm_perfmgr.c
> > @@ -212,7 +212,9 @@ static void perfmgr_mad_send_err_callback(void *bind_context,
> >  	p_mon_node = (monitored_node_t *) p_node;
> >  
> >  	OSM_LOG(pm->log, OSM_LOG_ERROR, "ERR 4C02: %s (0x%" PRIx64
> > -		") port %u\n", p_mon_node->name, p_mon_node->guid, port);
> > +		") port %u; DLID %u, TID 0x%" PRIx64 "\n", p_mon_node->name,
> > +		p_mon_node->guid, port, cl_ntoh16(p_madw->mad_addr.dest_lid),
> > +		cl_ntoh64(p_madw->p_mad->trans_id));
> >  
> >  	if (pm->subn->opt.perfmgr_redir && p_madw->status == IB_TIMEOUT) {
> >  		/* First, find the node in the monitored map */
> > diff --git a/opensm/osm_sa_mad_ctrl.c b/opensm/osm_sa_mad_ctrl.c
> > index bde88fa..4caead1 100644
> > --- a/opensm/osm_sa_mad_ctrl.c
> > +++ b/opensm/osm_sa_mad_ctrl.c
> > @@ -413,8 +413,18 @@ static void sa_mad_ctrl_send_err_callback(IN void *context,
> >  	   Retire the original request MAD.
> >  	 */
> >  
> > +	OSM_LOG(p_ctrl->p_log, OSM_LOG_ERROR, "ERR 1A0A: "
> > +		"SA MAD completed in error (%s): "
> > +		"%s(%s), attr_mod 0x%x, TID 0x%" PRIx64 ", DLID %u\n",
> > +		ib_get_err_str(p_madw->status),
> > +		ib_get_sa_method_str(p_madw->p_mad->method),
> > +		ib_get_sa_attr_str(p_madw->p_mad->attr_id),
> > +		cl_ntoh32(p_madw->p_mad->attr_mod),
> > +		cl_ntoh64(p_madw->p_mad->trans_id),
> > +		cl_ntoh16(p_madw->mad_addr.dest_lid));
> > +
> >  	osm_dump_sa_mad(p_ctrl->p_log, osm_madw_get_sa_mad_ptr(p_madw),
> > -			OSM_LOG_ERROR);
> > +			OSM_LOG_VERBOSE);
> >  
> >  	/*  sm_mad_ctrl_update_wire_stats( p_ctrl ); */
> >  
> > diff --git a/opensm/osm_sm_mad_ctrl.c b/opensm/osm_sm_mad_ctrl.c
> > index ee92c66..a3b444a 100644
> > --- a/opensm/osm_sm_mad_ctrl.c
> > +++ b/opensm/osm_sm_mad_ctrl.c
> > @@ -704,6 +704,7 @@ Exit:
> >   */
> >  static void sm_mad_ctrl_send_err_cb(IN void *context, IN osm_madw_t * p_madw)
> >  {
> > +	char lidstr[8];
> >  	osm_sm_mad_ctrl_t *p_ctrl = context;
> >  	ib_api_status_t status;
> >  	ib_smp_t *p_smp;
> > @@ -713,13 +714,24 @@ static void sm_mad_ctrl_send_err_cb(IN void *context, IN osm_madw_t * p_madw)
> >  	CL_ASSERT(p_madw);
> >  
> >  	p_smp = osm_madw_get_smp_ptr(p_madw);
> > +
> > +	if (p_smp->mgmt_class == IB_MCLASS_SUBN_DIR)
> > +		lidstr[0] = '\0';
> > +	else
> > +		snprintf(lidstr, 8, " DLID %u",
> > +			cl_ntoh16(p_madw->mad_addr.dest_lid));
> > +
> >  	OSM_LOG(p_ctrl->p_log, OSM_LOG_ERROR, "ERR 3113: "
> >  		"MAD completed in error (%s): "
> > -		"%s(%s), attr_mod 0x%x, TID 0x%" PRIx64 "\n",
> > +		"%s(%s), attr_mod 0x%x, TID 0x%" PRIx64 " %s\n",
> >  		ib_get_err_str(p_madw->status),
> >  		ib_get_sm_method_str(p_smp->method),
> >  		ib_get_sm_attr_str(p_smp->attr_id), cl_ntoh32(p_smp->attr_mod),
> > -		cl_ntoh64(p_smp->trans_id));
> > +		cl_ntoh64(p_smp->trans_id),
> > +		lidstr);
> > +
> > +	if (p_smp->mgmt_class == IB_MCLASS_SUBN_DIR)
> > +		osm_dump_smp_dr_path(p_ctrl->p_log, p_smp, OSM_LOG_ERROR);
> >  
> >  	/*
> >  	   If this was a SubnSet MAD, then this error might indicate a problem
> 


-- 
Ira Weiny
Math Programmer/Computer Scientist
Lawrence Livermore National Lab
925-423-8008
weiny2-i2BcT+NCU+M@public.gmane.org
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH V4] opensm: Move Error printing to MAD error call back functions.
       [not found]         ` <20111216114557.27cef81e.weiny2-i2BcT+NCU+M@public.gmane.org>
@ 2011-12-16 20:19           ` Hal Rosenstock
  0 siblings, 0 replies; 4+ messages in thread
From: Hal Rosenstock @ 2011-12-16 20:19 UTC (permalink / raw)
  To: Ira Weiny; +Cc: Alex Netes, linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org

On 12/16/2011 2:45 PM, Ira Weiny wrote:
> On Fri, 16 Dec 2011 11:27:40 -0800
> Hal Rosenstock <hal-LDSdmyG8hGV8YrgS2mwiifqBs+8SCbDb@public.gmane.org> wrote:
> 
>> On 12/16/2011 12:49 PM, Ira Weiny wrote:
>>>
>>> Only print the transaction ID of timed out MAD's on VERBOSE.
>>>
>>> Signed-off-by: Ira Weiny <weiny2-i2BcT+NCU+M@public.gmane.org>
>>> ---
> 
> [snip]
> 
>>> -			} else {
>>> -				ib_smp_t *smp;
>>> -
>>> -				/* Direct routed SMP */
>>> -				smp = (ib_smp_t *) mad;
>>> -				OSM_LOG(p_vend->p_log, OSM_LOG_ERROR, "ERR 5411: "
>>> -					"DR SMP Send completed with error -- dropping\n"
>>> -					"\t\t\tMethod 0x%X, Attr 0x%X, TID 0x%" PRIx64
>>> -					", Hop Ptr: 0x%X\n",
>>> -					mad->method, cl_ntoh16(mad->attr_id),
>>> -					cl_ntoh64(mad->trans_id), smp->hop_ptr);
>>
>> One thing I just noticed in cobbling up the other approach for
>> comparison purposes is that the logging of the hop pointer was removed.
>> Should that be preserved ?
> 
> Since we are printing the request MAD wouldn't the hop pointer always be 0?

Yes, that should be the case, but I don't recall whether the original
print always showed 0 when working off the returned umad rather than the
request MAD. So I guess that should be eliminated in the vendor umad
approach as well...

> And while we are at it we should print the DLID/SLID since it could be combined routing.

AFAIK OpenSM doesn't currently support combined routing (only the diags
do). This is a bigger change and that change should include any changes
needed for logging.

-- Hal

> Ira
> 
>>
>> -- Hal
>>
>>> -				osm_dump_smp_dr_path(p_vend->p_log, smp,
>>> -						     OSM_LOG_ERROR);
>>> -			}
>>> +			OSM_LOG(p_vend->p_log, OSM_LOG_VERBOSE, "ERR 5410: "
>>> +				"Receive Timeout on Send -- dropping "
>>> +				"TID 0x%" PRIx64 "\n", cl_ntoh64(mad->trans_id));
>>>  
>>>  			if (!(p_req_madw = get_madw(p_vend, &mad->trans_id))) {
>>>  				OSM_LOG(p_vend->p_log, OSM_LOG_ERROR,
>>> diff --git a/opensm/osm_helper.c b/opensm/osm_helper.c
>>> index f9f3d9d..b968679 100644
>>> --- a/opensm/osm_helper.c
>>> +++ b/opensm/osm_helper.c
>>> @@ -2059,8 +2059,9 @@ void osm_dump_smp_dr_path(IN osm_log_t * p_log, IN const ib_smp_t * p_smp,
>>>  		char buf[BUF_SIZE];
>>>  		unsigned n;
>>>  
>>> -		n = sprintf(buf, "Received SMP on a %u hop path: "
>>> -			    "Initial path = ", p_smp->hop_count);
>>> +		n = sprintf(buf, "   DR SMP (TID 0x%" PRIx64 ") on a %u hop path: "
>>> +			    "Initial path = ",
>>> +			    cl_ntoh64(p_smp->trans_id), p_smp->hop_count);
>>>  		n += sprint_uint8_arr(buf + n, sizeof(buf) - n,
>>>  				      p_smp->initial_path,
>>>  				      p_smp->hop_count + 1);
>>> diff --git a/opensm/osm_perfmgr.c b/opensm/osm_perfmgr.c
>>> index ded5a5e..fc3f74b 100644
>>> --- a/opensm/osm_perfmgr.c
>>> +++ b/opensm/osm_perfmgr.c
>>> @@ -212,7 +212,9 @@ static void perfmgr_mad_send_err_callback(void *bind_context,
>>>  	p_mon_node = (monitored_node_t *) p_node;
>>>  
>>>  	OSM_LOG(pm->log, OSM_LOG_ERROR, "ERR 4C02: %s (0x%" PRIx64
>>> -		") port %u\n", p_mon_node->name, p_mon_node->guid, port);
>>> +		") port %u; DLID %u, TID 0x%" PRIx64 "\n", p_mon_node->name,
>>> +		p_mon_node->guid, port, cl_ntoh16(p_madw->mad_addr.dest_lid),
>>> +		cl_ntoh64(p_madw->p_mad->trans_id));
>>>  
>>>  	if (pm->subn->opt.perfmgr_redir && p_madw->status == IB_TIMEOUT) {
>>>  		/* First, find the node in the monitored map */
>>> diff --git a/opensm/osm_sa_mad_ctrl.c b/opensm/osm_sa_mad_ctrl.c
>>> index bde88fa..4caead1 100644
>>> --- a/opensm/osm_sa_mad_ctrl.c
>>> +++ b/opensm/osm_sa_mad_ctrl.c
>>> @@ -413,8 +413,18 @@ static void sa_mad_ctrl_send_err_callback(IN void *context,
>>>  	   Retire the original request MAD.
>>>  	 */
>>>  
>>> +	OSM_LOG(p_ctrl->p_log, OSM_LOG_ERROR, "ERR 1A0A: "
>>> +		"SA MAD completed in error (%s): "
>>> +		"%s(%s), attr_mod 0x%x, TID 0x%" PRIx64 ", DLID %u\n",
>>> +		ib_get_err_str(p_madw->status),
>>> +		ib_get_sa_method_str(p_madw->p_mad->method),
>>> +		ib_get_sa_attr_str(p_madw->p_mad->attr_id),
>>> +		cl_ntoh32(p_madw->p_mad->attr_mod),
>>> +		cl_ntoh64(p_madw->p_mad->trans_id),
>>> +		cl_ntoh16(p_madw->mad_addr.dest_lid));
>>> +
>>>  	osm_dump_sa_mad(p_ctrl->p_log, osm_madw_get_sa_mad_ptr(p_madw),
>>> -			OSM_LOG_ERROR);
>>> +			OSM_LOG_VERBOSE);
>>>  
>>>  	/*  sm_mad_ctrl_update_wire_stats( p_ctrl ); */
>>>  
>>> diff --git a/opensm/osm_sm_mad_ctrl.c b/opensm/osm_sm_mad_ctrl.c
>>> index ee92c66..a3b444a 100644
>>> --- a/opensm/osm_sm_mad_ctrl.c
>>> +++ b/opensm/osm_sm_mad_ctrl.c
>>> @@ -704,6 +704,7 @@ Exit:
>>>   */
>>>  static void sm_mad_ctrl_send_err_cb(IN void *context, IN osm_madw_t * p_madw)
>>>  {
>>> +	char lidstr[8];
>>>  	osm_sm_mad_ctrl_t *p_ctrl = context;
>>>  	ib_api_status_t status;
>>>  	ib_smp_t *p_smp;
>>> @@ -713,13 +714,24 @@ static void sm_mad_ctrl_send_err_cb(IN void *context, IN osm_madw_t * p_madw)
>>>  	CL_ASSERT(p_madw);
>>>  
>>>  	p_smp = osm_madw_get_smp_ptr(p_madw);
>>> +
>>> +	if (p_smp->mgmt_class == IB_MCLASS_SUBN_DIR)
>>> +		lidstr[0] = '\0';
>>> +	else
>>> +		snprintf(lidstr, 8, " DLID %u",
>>> +			cl_ntoh16(p_madw->mad_addr.dest_lid));
>>> +
>>>  	OSM_LOG(p_ctrl->p_log, OSM_LOG_ERROR, "ERR 3113: "
>>>  		"MAD completed in error (%s): "
>>> -		"%s(%s), attr_mod 0x%x, TID 0x%" PRIx64 "\n",
>>> +		"%s(%s), attr_mod 0x%x, TID 0x%" PRIx64 " %s\n",
>>>  		ib_get_err_str(p_madw->status),
>>>  		ib_get_sm_method_str(p_smp->method),
>>>  		ib_get_sm_attr_str(p_smp->attr_id), cl_ntoh32(p_smp->attr_mod),
>>> -		cl_ntoh64(p_smp->trans_id));
>>> +		cl_ntoh64(p_smp->trans_id),
>>> +		lidstr);
>>> +
>>> +	if (p_smp->mgmt_class == IB_MCLASS_SUBN_DIR)
>>> +		osm_dump_smp_dr_path(p_ctrl->p_log, p_smp, OSM_LOG_ERROR);
>>>  
>>>  	/*
>>>  	   If this was a SubnSet MAD, then this error might indicate a problem
>>
> 
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2011-12-16 20:19 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2011-12-16 17:49 [PATCH V4] opensm: Move Error printing to MAD error call back functions Ira Weiny
     [not found] ` <20111216094934.0f7bd2ee.weiny2-i2BcT+NCU+M@public.gmane.org>
2011-12-16 19:27   ` Hal Rosenstock
     [not found]     ` <4EEB9BAC.5040104-LDSdmyG8hGV8YrgS2mwiifqBs+8SCbDb@public.gmane.org>
2011-12-16 19:45       ` Ira Weiny
     [not found]         ` <20111216114557.27cef81e.weiny2-i2BcT+NCU+M@public.gmane.org>
2011-12-16 20:19           ` Hal Rosenstock

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox