Linux RDMA and InfiniBand development
 help / color / mirror / Atom feed
From: Sasha Khapyorsky <sashak-smomgflXvOZWk0Htik3J/w@public.gmane.org>
To: Arthur Kepner <akepner-sJ/iWh9BUns@public.gmane.org>
Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
	Dale.R.Talcott-NSQ8wuThN14@public.gmane.org
Subject: Re: [PATCH/RFC] opensm: toggle sweeping V2
Date: Sat, 22 May 2010 20:04:31 +0300	[thread overview]
Message-ID: <20100522170431.GU28549@me> (raw)
In-Reply-To: <20100519235727.GP7678-sJ/iWh9BUns@public.gmane.org>

On 16:57 Wed 19 May     , Arthur Kepner wrote:
> 
> One of our customers recently merged some new systems into a 
> large, existing cluster. They requested a mechanism to prevent 
> opensm from sweeping while the new equipment was being added to 
> the IB fabric, and then resume sweeping once they felt confident 
> that the newly added (sub)fabric was correctly cabled, and fully 
> functional. They used something similar to the following patch. 
> 
> Comments?

I still do not understand what is wrong with running OpenSM with sweep
disabled and restarting it when the fabric is ready. But anyway, a new
console command looks less aggressive to me than signaling... :)

> Signed-off-by: Arthur Kepner <akepner-sJ/iWh9BUns@public.gmane.org>

The questions about the patch are below.

> 
> --- 
> 
>  include/opensm/osm_subnet.h |    6 ++++++
>  opensm/osm_console.c        |   32 ++++++++++++++++++++++++++++++++
>  opensm/osm_state_mgr.c      |    8 +++++++-
>  opensm/osm_subnet.c         |    1 +
>  opensm/osm_trap_rcv.c       |   35 +++++++++++++++++++++--------------
>  5 files changed, 67 insertions(+), 15 deletions(-)
> 
> diff --git a/opensm/include/opensm/osm_subnet.h b/opensm/include/opensm/osm_subnet.h
> index d79ed8f..2a1db99 100644
> --- a/opensm/include/opensm/osm_subnet.h
> +++ b/opensm/include/opensm/osm_subnet.h
> @@ -532,6 +532,7 @@ typedef struct osm_subn {
>  	boolean_t in_sweep_hop_0;
>  	boolean_t first_time_master_sweep;
>  	boolean_t coming_out_of_standby;
> +	boolean_t sweeping_enabled;
>  	unsigned need_update;
>  	cl_fmap_t mgrp_mgid_tbl;
>  	void *mboxes[IB_LID_MCAST_END_HO - IB_LID_MCAST_START_HO + 1];
> @@ -651,6 +652,11 @@ typedef struct osm_subn {
>  *		The flag is set true if the SM state was standby and now
>  *		changed to MASTER it is reset at the end of the sweep.
>  *
> +*	sweeping_enabled
> +*		FALSE - sweeping is administratively disabled, all
> +*		sweeping is inhibited, TRUE - sweeping is done
> +*		normally
> +*
>  *	need_update
>  *		This flag should be on during first non-master heavy
>  *		(including pre-master discovery stage)
> diff --git a/opensm/opensm/osm_console.c b/opensm/opensm/osm_console.c
> index 968486e..bc7bea3 100644
> --- a/opensm/opensm/osm_console.c
> +++ b/opensm/opensm/osm_console.c
> @@ -150,6 +150,16 @@ static void help_reroute(FILE * out, int detail)
>  	}
>  }
>  
> +static void help_sweep(FILE * out, int detail)
> +{
> +	fprintf(out, "sweep [on|off]\n");
> +	if (detail) {
> +		fprintf(out, "enable or disable sweeping\n");
> +		fprintf(out, "   [on] sweep normally\n");
> +		fprintf(out, "   [off] inhibit all sweeping\n");
> +	}
> +}
> +
>  static void help_status(FILE * out, int detail)
>  {
>  	fprintf(out, "status [loop]\n");
> @@ -427,11 +437,15 @@ static void print_status(osm_opensm_t * p_osm, FILE * out)
>  			p_osm->stats.sa_mads_ignored);
>  		fprintf(out, "\n   Subnet flags\n"
>  			"   ------------\n"
> +			"   Sweeping enabled               : %d\n"
> +			"   Sweep interval (seconds)       : %d\n"
>  			"   Ignore existing lfts           : %d\n"
>  			"   Subnet Init errors             : %d\n"
>  			"   In sweep hop 0                 : %d\n"
>  			"   First time master sweep        : %d\n"
>  			"   Coming out of standby          : %d\n",
> +			p_osm->subn.sweeping_enabled,
> +			p_osm->subn.opt.sweep_interval,
>  			p_osm->subn.ignore_existing_lfts,
>  			p_osm->subn.subnet_initialization_error,
>  			p_osm->subn.in_sweep_hop_0,
> @@ -495,6 +509,23 @@ static void reroute_parse(char **p_last, osm_opensm_t * p_osm, FILE * out)
>  	osm_opensm_sweep(p_osm);
>  }
>  
> +static void sweep_parse(char **p_last, osm_opensm_t * p_osm, FILE * out)
> +{
> +	char *p_cmd;
> +
> +	p_cmd = next_token(p_last);
> +	if (!p_cmd ||
> +	    (strcmp(p_cmd, "on") != 0 && strcmp(p_cmd, "off") != 0)) {
> +		fprintf(out, "Invalid sweep command\n");
> +		help_sweep(out, 1);
> +	} else {
> +		if (strcmp(p_cmd, "on") == 0)
> +			p_osm->subn.sweeping_enabled = TRUE;
> +		else
> +			p_osm->subn.sweeping_enabled = FALSE;
> +	}
> +}
> +
>  static void logflush_parse(char **p_last, osm_opensm_t * p_osm, FILE * out)
>  {
>  	fflush(p_osm->log.out_port);
> @@ -1332,6 +1363,7 @@ static const struct command console_cmds[] = {
>  	{"priority", &help_priority, &priority_parse},
>  	{"resweep", &help_resweep, &resweep_parse},
>  	{"reroute", &help_reroute, &reroute_parse},
> +	{"sweep", &help_sweep, &sweep_parse},
>  	{"status", &help_status, &status_parse},
>  	{"logflush", &help_logflush, &logflush_parse},
>  	{"querylid", &help_querylid, &querylid_parse},
> diff --git a/opensm/opensm/osm_state_mgr.c b/opensm/opensm/osm_state_mgr.c
> index e43463f..81c8f54 100644
> --- a/opensm/opensm/osm_state_mgr.c
> +++ b/opensm/opensm/osm_state_mgr.c
> @@ -1415,7 +1415,13 @@ void osm_state_mgr_process(IN osm_sm_t * sm, IN osm_signal_t signal)
>  
>  	switch (signal) {
>  	case OSM_SIGNAL_SWEEP:
> -		do_sweep(sm);
> +		if (!sm->p_subn->sweeping_enabled) {
> +			OSM_LOG(sm->p_log, OSM_LOG_DEBUG, "sweeping disabled - "
> +				"ignoring signal %s in state %s\n",
> +				osm_get_sm_signal_str(signal),
> +				osm_get_sm_mgr_state_str(sm->p_subn->sm_state));
> +		} else
> +			do_sweep(sm);
>  		break;
>  	case OSM_SIGNAL_IDLE_TIME_PROCESS_REQUEST:
>  		do_process_mgrp_queue(sm);
> diff --git a/opensm/opensm/osm_subnet.c b/opensm/opensm/osm_subnet.c
> index ac8cb37..ba2c812 100644
> --- a/opensm/opensm/osm_subnet.c
> +++ b/opensm/opensm/osm_subnet.c
> @@ -531,6 +531,7 @@ ib_api_status_t osm_subn_init(IN osm_subn_t * p_subn, IN osm_opensm_t * p_osm,
>  
>  	/* we assume master by default - so we only need to set it true if STANDBY */
>  	p_subn->coming_out_of_standby = FALSE;
> +	p_subn->sweeping_enabled = TRUE;
>  
>  	return IB_SUCCESS;
>  }
> diff --git a/opensm/opensm/osm_trap_rcv.c b/opensm/opensm/osm_trap_rcv.c
> index bf13239..ba366a9 100644
> --- a/opensm/opensm/osm_trap_rcv.c
> +++ b/opensm/opensm/osm_trap_rcv.c
> @@ -515,23 +515,30 @@ static void trap_rcv_process_request(IN osm_sm_t * sm,
>  check_sweep:
>  	/* do a sweep if we received a trap */
>  	if (sm->p_subn->opt.sweep_on_trap) {

> -		/* if this is trap number 128 or run_heavy_sweep is TRUE -
> -		   update the force_heavy_sweep flag of the subnet.
> -		   Sweep also on traps 144 - these traps signal a change of
> -		   certain port capabilities.
> -		   TODO: In the future this can be changed to just getting
> -		   PortInfo on this port instead of sweeping the entire subnet. */
> -		if (ib_notice_is_generic(p_ntci) &&
> -		    (cl_ntoh16(p_ntci->g_or_v.generic.trap_num) == 128 ||
> -		     cl_ntoh16(p_ntci->g_or_v.generic.trap_num) == 144 ||
> -		     run_heavy_sweep)) {
> -			OSM_LOG(sm->p_log, OSM_LOG_VERBOSE,
> -				"Forcing heavy sweep. Received trap:%u\n",
> +		if (!sm->p_subn->sweeping_enabled) {
> +			OSM_LOG(sm->p_log, OSM_LOG_DEBUG,
> +				"sweeping disabled - ignoring trap %u\n",
>  				cl_ntoh16(p_ntci->g_or_v.generic.trap_num));

Isn't this case already handled in osm_state_mgr_process(), which makes
this code addition in osm_trap_rcv.c redundant?

And if it is not, wouldn't it be simpler to check:

  	if (sm->p_subn->opt.sweep_on_trap && sm->p_subn->sweeping_enabled) {

in order to minimize the change set?

> +		} else {
> +			/* if this is trap number 128 or run_heavy_sweep is
> +			   TRUE - update the force_heavy_sweep flag of the
> +			   subnet.  Sweep also on traps 144 - these traps
> +			   signal a change of certain port capabilities.
> +			   TODO: In the future this can be changed to just
> +			   getting PortInfo on this port instead of sweeping
> +			   the entire subnet. */
> +			if (ib_notice_is_generic(p_ntci) &&
> +			    (cl_ntoh16(p_ntci->g_or_v.generic.trap_num) == 128 ||
> +			     cl_ntoh16(p_ntci->g_or_v.generic.trap_num) == 144 ||
> +			     run_heavy_sweep)) {
> +				OSM_LOG(sm->p_log, OSM_LOG_VERBOSE,
> +					"Forcing heavy sweep. Received trap:%u\n",
> +					cl_ntoh16(p_ntci->g_or_v.generic.trap_num));
>  
> -			sm->p_subn->force_heavy_sweep = TRUE;
> +				sm->p_subn->force_heavy_sweep = TRUE;
> +			}
> +			osm_sm_signal(sm, OSM_SIGNAL_SWEEP);
>  		}
> -		osm_sm_signal(sm, OSM_SIGNAL_SWEEP);
>  	}
>  
>  	/* If we reached here due to trap 129/130/131 - do not need to do
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

  parent reply	other threads:[~2010-05-22 17:04 UTC|newest]

Thread overview: 5+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2010-05-19 23:57 [PATCH/RFC] opensm: toggle sweeping V2 Arthur Kepner
     [not found] ` <20100519235727.GP7678-sJ/iWh9BUns@public.gmane.org>
2010-05-22 17:04   ` Sasha Khapyorsky [this message]
2010-05-24 21:18     ` Arthur Kepner
     [not found]       ` <20100524211830.GJ2678-sJ/iWh9BUns@public.gmane.org>
2010-05-25 15:14         ` Sasha Khapyorsky
2010-05-25 16:50           ` Arthur Kepner

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20100522170431.GU28549@me \
    --to=sashak-smomgflxvozwk0htik3j/w@public.gmane.org \
    --cc=Dale.R.Talcott-NSQ8wuThN14@public.gmane.org \
    --cc=akepner-sJ/iWh9BUns@public.gmane.org \
    --cc=linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox