From: Sasha Khapyorsky <sashak-smomgflXvOZWk0Htik3J/w@public.gmane.org>
To: Arthur Kepner <akepner-sJ/iWh9BUns@public.gmane.org>
Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
Dale.R.Talcott-NSQ8wuThN14@public.gmane.org
Subject: Re: [PATCH/RFC] opensm: toggle sweeping V2
Date: Sat, 22 May 2010 20:04:31 +0300 [thread overview]
Message-ID: <20100522170431.GU28549@me> (raw)
In-Reply-To: <20100519235727.GP7678-sJ/iWh9BUns@public.gmane.org>
On 16:57 Wed 19 May , Arthur Kepner wrote:
>
> One of our customers recently merged some new systems into a
> large, existing cluster. They requested a mechanism to prevent
> opensm from sweeping while the new equipment was being added to
> the IB fabric, and then resume sweeping once they felt confident
> that the newly added (sub)fabric was correctly cabled, and fully
> functional. They used something similar to the following patch.
>
> Comments?
I still do not understand what is wrong with running OpenSM with sweeping
disabled and restarting it when the fabric is ready. But anyway, a new
console command looks less aggressive to me than signaling... :)
> Signed-off-by: Arthur Kepner <akepner-sJ/iWh9BUns@public.gmane.org>
My questions about the patch are below.
>
> ---
>
> include/opensm/osm_subnet.h | 6 ++++++
> opensm/osm_console.c | 32 ++++++++++++++++++++++++++++++++
> opensm/osm_state_mgr.c | 8 +++++++-
> opensm/osm_subnet.c | 1 +
> opensm/osm_trap_rcv.c | 35 +++++++++++++++++++++--------------
> 5 files changed, 67 insertions(+), 15 deletions(-)
>
> diff --git a/opensm/include/opensm/osm_subnet.h b/opensm/include/opensm/osm_subnet.h
> index d79ed8f..2a1db99 100644
> --- a/opensm/include/opensm/osm_subnet.h
> +++ b/opensm/include/opensm/osm_subnet.h
> @@ -532,6 +532,7 @@ typedef struct osm_subn {
> boolean_t in_sweep_hop_0;
> boolean_t first_time_master_sweep;
> boolean_t coming_out_of_standby;
> + boolean_t sweeping_enabled;
> unsigned need_update;
> cl_fmap_t mgrp_mgid_tbl;
> void *mboxes[IB_LID_MCAST_END_HO - IB_LID_MCAST_START_HO + 1];
> @@ -651,6 +652,11 @@ typedef struct osm_subn {
> * The flag is set true if the SM state was standby and now
> * changed to MASTER it is reset at the end of the sweep.
> *
> +* sweeping_enabled
> +* FALSE - sweeping is administratively disabled, all
> +* sweeping is inhibited, TRUE - sweeping is done
> +* normally
> +*
> * need_update
> * This flag should be on during first non-master heavy
> * (including pre-master discovery stage)
> diff --git a/opensm/opensm/osm_console.c b/opensm/opensm/osm_console.c
> index 968486e..bc7bea3 100644
> --- a/opensm/opensm/osm_console.c
> +++ b/opensm/opensm/osm_console.c
> @@ -150,6 +150,16 @@ static void help_reroute(FILE * out, int detail)
> }
> }
>
> +static void help_sweep(FILE * out, int detail)
> +{
> + fprintf(out, "sweep [on|off]\n");
> + if (detail) {
> + fprintf(out, "enable or disable sweeping\n");
> + fprintf(out, " [on] sweep normally\n");
> + fprintf(out, " [off] inhibit all sweeping\n");
> + }
> +}
> +
> static void help_status(FILE * out, int detail)
> {
> fprintf(out, "status [loop]\n");
> @@ -427,11 +437,15 @@ static void print_status(osm_opensm_t * p_osm, FILE * out)
> p_osm->stats.sa_mads_ignored);
> fprintf(out, "\n Subnet flags\n"
> " ------------\n"
> + " Sweeping enabled : %d\n"
> + " Sweep interval (seconds) : %d\n"
> " Ignore existing lfts : %d\n"
> " Subnet Init errors : %d\n"
> " In sweep hop 0 : %d\n"
> " First time master sweep : %d\n"
> " Coming out of standby : %d\n",
> + p_osm->subn.sweeping_enabled,
> + p_osm->subn.opt.sweep_interval,
> p_osm->subn.ignore_existing_lfts,
> p_osm->subn.subnet_initialization_error,
> p_osm->subn.in_sweep_hop_0,
> @@ -495,6 +509,23 @@ static void reroute_parse(char **p_last, osm_opensm_t * p_osm, FILE * out)
> osm_opensm_sweep(p_osm);
> }
>
> +static void sweep_parse(char **p_last, osm_opensm_t * p_osm, FILE * out)
> +{
> + char *p_cmd;
> +
> + p_cmd = next_token(p_last);
> + if (!p_cmd ||
> + (strcmp(p_cmd, "on") != 0 && strcmp(p_cmd, "off") != 0)) {
> + fprintf(out, "Invalid sweep command\n");
> + help_sweep(out, 1);
> + } else {
> + if (strcmp(p_cmd, "on") == 0)
> + p_osm->subn.sweeping_enabled = TRUE;
> + else
> + p_osm->subn.sweeping_enabled = FALSE;
> + }
> +}
> +
> static void logflush_parse(char **p_last, osm_opensm_t * p_osm, FILE * out)
> {
> fflush(p_osm->log.out_port);
> @@ -1332,6 +1363,7 @@ static const struct command console_cmds[] = {
> {"priority", &help_priority, &priority_parse},
> {"resweep", &help_resweep, &resweep_parse},
> {"reroute", &help_reroute, &reroute_parse},
> + {"sweep", &help_sweep, &sweep_parse},
> {"status", &help_status, &status_parse},
> {"logflush", &help_logflush, &logflush_parse},
> {"querylid", &help_querylid, &querylid_parse},
> diff --git a/opensm/opensm/osm_state_mgr.c b/opensm/opensm/osm_state_mgr.c
> index e43463f..81c8f54 100644
> --- a/opensm/opensm/osm_state_mgr.c
> +++ b/opensm/opensm/osm_state_mgr.c
> @@ -1415,7 +1415,13 @@ void osm_state_mgr_process(IN osm_sm_t * sm, IN osm_signal_t signal)
>
> switch (signal) {
> case OSM_SIGNAL_SWEEP:
> - do_sweep(sm);
> + if (!sm->p_subn->sweeping_enabled) {
> + OSM_LOG(sm->p_log, OSM_LOG_DEBUG, "sweeping disabled - "
> + "ignoring signal %s in state %s\n",
> + osm_get_sm_signal_str(signal),
> + osm_get_sm_mgr_state_str(sm->p_subn->sm_state));
> + } else
> + do_sweep(sm);
> break;
> case OSM_SIGNAL_IDLE_TIME_PROCESS_REQUEST:
> do_process_mgrp_queue(sm);
> diff --git a/opensm/opensm/osm_subnet.c b/opensm/opensm/osm_subnet.c
> index ac8cb37..ba2c812 100644
> --- a/opensm/opensm/osm_subnet.c
> +++ b/opensm/opensm/osm_subnet.c
> @@ -531,6 +531,7 @@ ib_api_status_t osm_subn_init(IN osm_subn_t * p_subn, IN osm_opensm_t * p_osm,
>
> /* we assume master by default - so we only need to set it true if STANDBY */
> p_subn->coming_out_of_standby = FALSE;
> + p_subn->sweeping_enabled = TRUE;
>
> return IB_SUCCESS;
> }
> diff --git a/opensm/opensm/osm_trap_rcv.c b/opensm/opensm/osm_trap_rcv.c
> index bf13239..ba366a9 100644
> --- a/opensm/opensm/osm_trap_rcv.c
> +++ b/opensm/opensm/osm_trap_rcv.c
> @@ -515,23 +515,30 @@ static void trap_rcv_process_request(IN osm_sm_t * sm,
> check_sweep:
> /* do a sweep if we received a trap */
> if (sm->p_subn->opt.sweep_on_trap) {
> - /* if this is trap number 128 or run_heavy_sweep is TRUE -
> - update the force_heavy_sweep flag of the subnet.
> - Sweep also on traps 144 - these traps signal a change of
> - certain port capabilities.
> - TODO: In the future this can be changed to just getting
> - PortInfo on this port instead of sweeping the entire subnet. */
> - if (ib_notice_is_generic(p_ntci) &&
> - (cl_ntoh16(p_ntci->g_or_v.generic.trap_num) == 128 ||
> - cl_ntoh16(p_ntci->g_or_v.generic.trap_num) == 144 ||
> - run_heavy_sweep)) {
> - OSM_LOG(sm->p_log, OSM_LOG_VERBOSE,
> - "Forcing heavy sweep. Received trap:%u\n",
> + if (!sm->p_subn->sweeping_enabled) {
> + OSM_LOG(sm->p_log, OSM_LOG_DEBUG,
> + "sweeping disabled - ignoring trap %u\n",
> cl_ntoh16(p_ntci->g_or_v.generic.trap_num));
Isn't this case already handled in osm_state_mgr_process(), making this code
addition in osm_trap_rcv.c redundant?
And if it is not, wouldn't it be simpler to check:
if (sm->p_subn->opt.sweep_on_trap && sm->p_subn->sweeping_enabled) {
in order to minimize the change set?
> + } else {
> + /* if this is trap number 128 or run_heavy_sweep is
> + TRUE - update the force_heavy_sweep flag of the
> + subnet. Sweep also on traps 144 - these traps
> + signal a change of certain port capabilities.
> + TODO: In the future this can be changed to just
> + getting PortInfo on this port instead of sweeping
> + the entire subnet. */
> + if (ib_notice_is_generic(p_ntci) &&
> + (cl_ntoh16(p_ntci->g_or_v.generic.trap_num) == 128 ||
> + cl_ntoh16(p_ntci->g_or_v.generic.trap_num) == 144 ||
> + run_heavy_sweep)) {
> + OSM_LOG(sm->p_log, OSM_LOG_VERBOSE,
> + "Forcing heavy sweep. Received trap:%u\n",
> + cl_ntoh16(p_ntci->g_or_v.generic.trap_num));
>
> - sm->p_subn->force_heavy_sweep = TRUE;
> + sm->p_subn->force_heavy_sweep = TRUE;
> + }
> + osm_sm_signal(sm, OSM_SIGNAL_SWEEP);
> }
> - osm_sm_signal(sm, OSM_SIGNAL_SWEEP);
> }
>
> /* If we reached here due to trap 129/130/131 - do not need to do
>
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
next prev parent reply other threads:[~2010-05-22 17:04 UTC|newest]
Thread overview: 5+ messages / expand[flat|nested] mbox.gz Atom feed top
2010-05-19 23:57 [PATCH/RFC] opensm: toggle sweeping V2 Arthur Kepner
[not found] ` <20100519235727.GP7678-sJ/iWh9BUns@public.gmane.org>
2010-05-22 17:04 ` Sasha Khapyorsky [this message]
2010-05-24 21:18 ` Arthur Kepner
[not found] ` <20100524211830.GJ2678-sJ/iWh9BUns@public.gmane.org>
2010-05-25 15:14 ` Sasha Khapyorsky
2010-05-25 16:50 ` Arthur Kepner
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20100522170431.GU28549@me \
--to=sashak-smomgflxvozwk0htik3j/w@public.gmane.org \
--cc=Dale.R.Talcott-NSQ8wuThN14@public.gmane.org \
--cc=akepner-sJ/iWh9BUns@public.gmane.org \
--cc=linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox