[Patch] plug async scan race at 1st node scan

linux-scsi.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

* [Patch] plug async scan race at 1st node scan
@ 2007-08-20 13:10 James Smart
  2007-08-20 13:49 ` Matthew Wilcox
  0 siblings, 1 reply; 6+ messages in thread
From: James Smart @ 2007-08-20 13:10 UTC (permalink / raw)
  To: linux-scsi; +Cc: tore

In testing 2.6.23-rc3, there is a small window where the async-per-target
scan of the transport can beat the call from the LLDD to scsi_scan_host().
If so, the target scan starts, and can add the sdev (and the sysfslinks) before
the flags for async scan can prevent it. Thus, when async scan finishes and
asks for all sdevs to be enumerated -EEXIST errors will pop up.

This patch has scsi_alloc_host() look for the async scan hooks, and if they
exist, sets the async_scan flag to a pre-state, which still allows the async
target scan to continue, but stops the sysfs enumeration.


-- james s

This patch cut against 2.6.23-rc3 plus the following patch:
   http://marc.info/?l=linux-scsi&m=118289275414202&w=2

PS: there really should be better hooks for knowing if the driver expects
  async or background scanning (perhaps the whole pre-state should be set
  by the driver).

Signed-off-by: James Smart <James.Smart@emulex.com>



diff -upNr a/drivers/scsi/hosts.c b/drivers/scsi/hosts.c
--- a/drivers/scsi/hosts.c	2007-08-17 11:49:47.000000000 -0400
+++ b/drivers/scsi/hosts.c	2007-08-19 12:23:06.000000000 -0400
@@ -343,6 +343,12 @@ struct Scsi_Host *scsi_host_alloc(struct
 	shost->use_clustering = sht->use_clustering;
 	shost->ordered_tag = sht->ordered_tag;
 
+	if ((strncmp(scsi_scan_type, "async", 5) == 0) &&
+	    (shost->hostt->scan_finished))
+		shost->async_scan = ASYNC_SCAN_PREASYNC;
+	else
+		shost->async_scan = ASYNC_SCAN_CMPLT;
+
 	if (sht->max_host_blocked)
 		shost->max_host_blocked = sht->max_host_blocked;
 	else
diff -upNr a/drivers/scsi/scsi_priv.h b/drivers/scsi/scsi_priv.h
--- a/drivers/scsi/scsi_priv.h	2007-07-08 19:32:17.000000000 -0400
+++ b/drivers/scsi/scsi_priv.h	2007-08-19 12:23:49.000000000 -0400
@@ -38,9 +38,6 @@ static inline void scsi_log_completion(s
 	{ };
 #endif
 
-/* scsi_scan.c */
-int scsi_complete_async_scans(void);
-
 /* scsi_devinfo.c */
 extern int scsi_get_device_flags(struct scsi_device *sdev,
 				 const unsigned char *vendor,
@@ -96,6 +93,8 @@ extern int scsi_scan_host_selected(struc
 				   unsigned int, unsigned int, int);
 extern void scsi_forget_host(struct Scsi_Host *);
 extern void scsi_rescan_device(struct device *);
+extern char scsi_scan_type[];
+int scsi_complete_async_scans(void);
 
 /* scsi_sysctl.c */
 #ifdef CONFIG_SYSCTL
diff -upNr a/drivers/scsi/scsi_scan.c b/drivers/scsi/scsi_scan.c
--- a/drivers/scsi/scsi_scan.c	2007-08-19 12:21:10.000000000 -0400
+++ b/drivers/scsi/scsi_scan.c	2007-08-19 12:33:06.000000000 -0400
@@ -95,7 +95,7 @@ MODULE_PARM_DESC(max_luns,
 #define SCSI_SCAN_TYPE_DEFAULT "sync"
 #endif
 
-static char scsi_scan_type[6] = SCSI_SCAN_TYPE_DEFAULT;
+char scsi_scan_type[6] = SCSI_SCAN_TYPE_DEFAULT;
 
 module_param_string(scan, scsi_scan_type, sizeof(scsi_scan_type), S_IRUGO);
 MODULE_PARM_DESC(scan, "sync, async or none");
@@ -908,7 +908,7 @@ static int scsi_add_lun(struct scsi_devi
 	 * register it and tell the rest of the kernel
 	 * about it.
 	 */
-	if (!async && scsi_sysfs_add_sdev(sdev) != 0)
+	if ((async == ASYNC_SCAN_CMPLT) && scsi_sysfs_add_sdev(sdev) != 0)
 		return SCSI_SCAN_NO_RESPONSE;
 
 	return SCSI_SCAN_LUN_PRESENT;
@@ -1472,7 +1472,7 @@ struct scsi_device *__scsi_add_device(st
 		return ERR_PTR(-ENOMEM);
 
 	mutex_lock(&shost->scan_mutex);
-	if (!shost->async_scan)
+	if (shost->async_scan == ASYNC_SCAN_CMPLT)
 		scsi_complete_async_scans();
 
 	if (scsi_host_scan_allowed(shost))
@@ -1588,7 +1588,7 @@ void scsi_scan_target(struct device *par
 		return;
 
 	mutex_lock(&shost->scan_mutex);
-	if (!shost->async_scan)
+	if (shost->async_scan == ASYNC_SCAN_CMPLT)
 		scsi_complete_async_scans();
 
 	if (scsi_host_scan_allowed(shost))
@@ -1641,7 +1641,7 @@ int scsi_scan_host_selected(struct Scsi_
 		return -EINVAL;
 
 	mutex_lock(&shost->scan_mutex);
-	if (!shost->async_scan)
+	if (shost->async_scan == ASYNC_SCAN_CMPLT)
 		scsi_complete_async_scans();
 
 	if (scsi_host_scan_allowed(shost)) {
@@ -1686,7 +1686,7 @@ static struct async_scan_data *scsi_prep
 	if (strncmp(scsi_scan_type, "sync", 4) == 0)
 		return NULL;
 
-	if (shost->async_scan) {
+	if (shost->async_scan == ASYNC_SCAN_RUNNING) {
 		printk("%s called twice for host %d", __FUNCTION__,
 				shost->host_no);
 		dump_stack();
@@ -1703,7 +1703,7 @@ static struct async_scan_data *scsi_prep
 
 	mutex_lock(&shost->scan_mutex);
 	spin_lock_irqsave(shost->host_lock, flags);
-	shost->async_scan = 1;
+	shost->async_scan = ASYNC_SCAN_RUNNING;
 	spin_unlock_irqrestore(shost->host_lock, flags);
 	mutex_unlock(&shost->scan_mutex);
 
@@ -1740,9 +1740,13 @@ static void scsi_finish_async_scan(struc
 
 	mutex_lock(&shost->scan_mutex);
 
-	if (!shost->async_scan) {
-		printk("%s called twice for host %d", __FUNCTION__,
+	if (shost->async_scan != ASYNC_SCAN_RUNNING) {
+		if (shost->async_scan == ASYNC_SCAN_CMPLT)
+			printk("%s called twice for host %d", __FUNCTION__,
 				shost->host_no);
+		else /* shost->async_scan == ASYNC_SCAN_PREASYNC */
+			printk("%s called prior to async scan %d", 
+				__FUNCTION__, shost->host_no);
 		dump_stack();
 		return;
 	}
@@ -1752,7 +1756,7 @@ static void scsi_finish_async_scan(struc
 	scsi_sysfs_add_devices(shost);
 
 	spin_lock_irqsave(shost->host_lock, flags);
-	shost->async_scan = 0;
+	shost->async_scan = ASYNC_SCAN_CMPLT;
 	spin_unlock_irqrestore(shost->host_lock, flags);
 
 	mutex_unlock(&shost->scan_mutex);
diff -upNr a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h
--- a/include/scsi/scsi_host.h	2007-08-17 11:49:56.000000000 -0400
+++ b/include/scsi/scsi_host.h	2007-08-19 12:24:28.000000000 -0400
@@ -603,7 +603,10 @@ struct Scsi_Host {
 	unsigned tmf_in_progress:1;
 
 	/* Asynchronous scan in progress */
-	unsigned async_scan:1;
+	unsigned async_scan:2;
+#define ASYNC_SCAN_CMPLT	0
+#define ASYNC_SCAN_RUNNING	1
+#define ASYNC_SCAN_PREASYNC	2
 
 	/*
 	 * Optional work queue to be utilized by the transport



^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [Patch] plug async scan race at 1st node scan
  2007-08-20 13:10 [Patch] plug async scan race at 1st node scan James Smart
@ 2007-08-20 13:49 ` Matthew Wilcox
  2007-08-20 14:02   ` James Smart
  2008-03-01 11:44   ` Mike Christie
  0 siblings, 2 replies; 6+ messages in thread
From: Matthew Wilcox @ 2007-08-20 13:49 UTC (permalink / raw)
  To: James Smart; +Cc: linux-scsi, tore

On Mon, Aug 20, 2007 at 09:10:35AM -0400, James Smart wrote:
> In testing 2.6.23-rc3, there is a small window where the async-per-target
> scan of the transport can beat the call from the LLDD to scsi_scan_host().

I'd assumed that events wouldn't come in until ->scan_start was called.
I see lpfc doesn't have one; is it possible to restructure it to have one?

(In any case, good job tracking this down; it was really annoying me.)

Possibly we should be less forgiving, and require drivers to have a
scan_start, otherwise they can't avoid this race.

-- 
"Bill, look, we understand that you're interested in selling us this
operating system, but compare it to ours.  We can't possibly take such
a retrograde step."

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [Patch] plug async scan race at 1st node scan
  2007-08-20 13:49 ` Matthew Wilcox
@ 2007-08-20 14:02   ` James Smart
  2007-08-20 14:32     ` Matthew Wilcox
  2008-03-01 11:44   ` Mike Christie
  1 sibling, 1 reply; 6+ messages in thread
From: James Smart @ 2007-08-20 14:02 UTC (permalink / raw)
  To: Matthew Wilcox; +Cc: linux-scsi, tore

Well - depends on what the semantics of scan_start are.... to date, there
really are none....  and what does requiring it mean ?

What's implied is that you want "bring up link" to be enabled in scan_start().
Doable, but the code paths weren't put together expecting this, so it may be
a bit of work. I'll have to look at it.   Also, you're asking me to fix one
driver, without thinking about the structure in others.... I'd rather the
api itself locked down state/behavior, not simply the LLD coding.

Before starting, I'd rather we setting on what the semantics of the api is.
To the uninitiated, requiring a driver to call scsi_scan_host(), when the
transport drives all discovery, and where the scan_host really has nothing
to do with scanning, but rather creates a firewall delay to hold off serializing
of device enumerations - is all very confusing.  I'd rather we had a different
entry point for those things that supply start/finish routines and it was
named more in line with "start discovery delay".

Adding the notion of scan_start bringing up the link sounds reasonable. However,
how does this translate to other bus types ?

-- james s

Matthew Wilcox wrote:
> On Mon, Aug 20, 2007 at 09:10:35AM -0400, James Smart wrote:
>> In testing 2.6.23-rc3, there is a small window where the async-per-target
>> scan of the transport can beat the call from the LLDD to scsi_scan_host().
> 
> I'd assumed that events wouldn't come in until ->scan_start was called.
> I see lpfc doesn't have one; is it possible to restructure it to have one?
> 
> (In any case, good job tracking this down; it was really annoying me.)
> 
> Possibly we should be less forgiving, and require drivers to have a
> scan_start, otherwise they can't avoid this race.

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [Patch] plug async scan race at 1st node scan
  2007-08-20 14:02   ` James Smart
@ 2007-08-20 14:32     ` Matthew Wilcox
  0 siblings, 0 replies; 6+ messages in thread
From: Matthew Wilcox @ 2007-08-20 14:32 UTC (permalink / raw)
  To: James Smart; +Cc: linux-scsi, tore

On Mon, Aug 20, 2007 at 10:02:45AM -0400, James Smart wrote:
> Well - depends on what the semantics of scan_start are.... to date, there
> really are none....  and what does requiring it mean ?

OK, and to make this discussion exceptionally useful, all comments are
to be made in the context of modifications to the documentation:

        /*
         * If the host wants to be called before the scan starts, but
         * after the midlayer has set up ready for the scan, it can fill
         * in this function.
         */

When I say "require it", I mean that currently we have the code:

        if (shost->hostt->scan_finished) {
                unsigned long start = jiffies;
                if (shost->hostt->scan_start)
                        shost->hostt->scan_start(shost);

I propose removing that second 'if':

        if (shost->hostt->scan_finished) {
                unsigned long start = jiffies;
		shost->hostt->scan_start(shost);

And here's my proposal for new documentation for scan_start():

	/*
	 * If the host fills in scan_finished, above, it must also fill in
	 * scan_start.  scan_start will be called by the midlayer to inform
	 * the host that it is now ready to receive requests to scan targets.
	 */

> What's implied is that you want "bring up link" to be enabled in 
> scan_start().

That's certainly one obvious way to do it.  Another way would be to have
a flag in the driver blocking calls to the midlayer ... I think you have
that already?

> Doable, but the code paths weren't put together expecting this, so it may be
> a bit of work. I'll have to look at it.   Also, you're asking me to fix one
> driver, without thinking about the structure in others.... I'd rather the
> api itself locked down state/behavior, not simply the LLD coding.

I think other drivers already do this.  We only have three drivers
currently using this API -- lpfc, qla2xxx and aic94xx.  asd_scan_start()
enables the PHYs.  qla2xxx_scan_start sets some bits, but I'm not quite
sure of their effect.

> Before starting, I'd rather we setting on what the semantics of the api is.
> To the uninitiated, requiring a driver to call scsi_scan_host(), when the
> transport drives all discovery, and where the scan_host really has nothing
> to do with scanning, but rather creates a firewall delay to hold off 
> serializing
> of device enumerations - is all very confusing.  I'd rather we had a 
> different
> entry point for those things that supply start/finish routines and it was
> named more in line with "start discovery delay".

The name certainly isn't great, but I thought we agreed that having a
common entry point for all SCSI hosts was a good idea.

> Adding the notion of scan_start bringing up the link sounds reasonable. 
> However,
> how does this translate to other bus types ?

It works for SAS and FC ... I don't see why it wouldn't work for other
bus types.  Obviously, we don't call it for SPI.

> -- james s
> 
> Matthew Wilcox wrote:
> >On Mon, Aug 20, 2007 at 09:10:35AM -0400, James Smart wrote:
> >>In testing 2.6.23-rc3, there is a small window where the async-per-target
> >>scan of the transport can beat the call from the LLDD to scsi_scan_host().
> >
> >I'd assumed that events wouldn't come in until ->scan_start was called.
> >I see lpfc doesn't have one; is it possible to restructure it to have one?
> >
> >(In any case, good job tracking this down; it was really annoying me.)
> >
> >Possibly we should be less forgiving, and require drivers to have a
> >scan_start, otherwise they can't avoid this race.
> 
> -
> To unsubscribe from this list: send the line "unsubscribe linux-scsi" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

-- 
"Bill, look, we understand that you're interested in selling us this
operating system, but compare it to ours.  We can't possibly take such
a retrograde step."

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [Patch] plug async scan race at 1st node scan
  2007-08-20 13:49 ` Matthew Wilcox
  2007-08-20 14:02   ` James Smart
@ 2008-03-01 11:44   ` Mike Christie
  2008-03-01 14:26     ` Matthew Wilcox
  1 sibling, 1 reply; 6+ messages in thread
From: Mike Christie @ 2008-03-01 11:44 UTC (permalink / raw)
  To: Matthew Wilcox; +Cc: James Smart, linux-scsi, tore

Matthew Wilcox wrote:
> On Mon, Aug 20, 2007 at 09:10:35AM -0400, James Smart wrote:
>> In testing 2.6.23-rc3, there is a small window where the async-per-target
>> scan of the transport can beat the call from the LLDD to scsi_scan_host().
> 
> I'd assumed that events wouldn't come in until ->scan_start was called.
> I see lpfc doesn't have one; is it possible to restructure it to have one?
> 
> (In any case, good job tracking this down; it was really annoying me.)
> 
> Possibly we should be less forgiving, and require drivers to have a
> scan_start, otherwise they can't avoid this race.
> 

I am hitting this problem with qla4xxx (added async scanning to 
2.6.25-rc). Was the final decision that we have to add a scan_start? I 
did not see scan_start be made mandatory and I did not see James's patch 
go in.

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [Patch] plug async scan race at 1st node scan
  2008-03-01 11:44   ` Mike Christie
@ 2008-03-01 14:26     ` Matthew Wilcox
  0 siblings, 0 replies; 6+ messages in thread
From: Matthew Wilcox @ 2008-03-01 14:26 UTC (permalink / raw)
  To: Mike Christie; +Cc: James Smart, linux-scsi, tore

On Sat, Mar 01, 2008 at 05:44:29AM -0600, Mike Christie wrote:
> >Possibly we should be less forgiving, and require drivers to have a
> >scan_start, otherwise they can't avoid this race.
> 
> I am hitting this problem with qla4xxx (added async scanning to 
> 2.6.25-rc). Was the final decision that we have to add a scan_start? I 
> did not see scan_start be made mandatory and I did not see James's patch 
> go in.

You can't really make it mandatory ... it's only mandatory *if* you have
a scan_finished.

The documentation should be updated though.  Possibly the function
prototype should be changed from returning void to returning int, to
allow drivers to say "Oh no, I can't start after all".

-- 
Intel are signing my paycheques ... these opinions are still mine
"Bill, look, we understand that you're interested in selling us this
operating system, but compare it to ours.  We can't possibly take such
a retrograde step."

^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2008-03-01 14:27 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2007-08-20 13:10 [Patch] plug async scan race at 1st node scan James Smart
2007-08-20 13:49 ` Matthew Wilcox
2007-08-20 14:02   ` James Smart
2007-08-20 14:32     ` Matthew Wilcox
2008-03-01 11:44   ` Mike Christie
2008-03-01 14:26     ` Matthew Wilcox

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).