linux-scsi.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] Convert scsi_scan to use generic async mechanism
@ 2009-04-28 19:35 Matthew Wilcox
  2009-04-29 14:39 ` Matthew Wilcox
                   ` (2 more replies)
  0 siblings, 3 replies; 7+ messages in thread
From: Matthew Wilcox @ 2009-04-28 19:35 UTC (permalink / raw)
  To: linux-scsi; +Cc: Arjan van de Ven


The new generic async scanning infrastructure is a perfect replacement
for the scsi async scanning code.  We do need to use a separate domain
as libata drivers will deadlock waiting for themselves to complete if
we don't.  Tested with 515 LUNs (3 on AHCI, two fibre channel cards,
each with two targets, each with 128 LUNs).

Signed-off-by: Matthew Wilcox <willy@linux.intel.com>

diff --git a/drivers/scsi/scsi_scan.c b/drivers/scsi/scsi_scan.c
index 6f51ca4..515e7f9 100644
--- a/drivers/scsi/scsi_scan.c
+++ b/drivers/scsi/scsi_scan.c
@@ -122,15 +122,7 @@ MODULE_PARM_DESC(inq_timeout,
 		 "Timeout (in seconds) waiting for devices to answer INQUIRY."
 		 " Default is 5. Some non-compliant devices need more.");
 
-/* This lock protects only this list */
-static DEFINE_SPINLOCK(async_scan_lock);
-static LIST_HEAD(scanning_hosts);
-
-struct async_scan_data {
-	struct list_head list;
-	struct Scsi_Host *shost;
-	struct completion prev_finished;
-};
+static LIST_HEAD(scan_domain);
 
 /**
  * scsi_complete_async_scans - Wait for asynchronous scans to complete
@@ -142,44 +134,7 @@ struct async_scan_data {
  */
 int scsi_complete_async_scans(void)
 {
-	struct async_scan_data *data;
-
-	do {
-		if (list_empty(&scanning_hosts))
-			return 0;
-		/* If we can't get memory immediately, that's OK.  Just
-		 * sleep a little.  Even if we never get memory, the async
-		 * scans will finish eventually.
-		 */
-		data = kmalloc(sizeof(*data), GFP_KERNEL);
-		if (!data)
-			msleep(1);
-	} while (!data);
-
-	data->shost = NULL;
-	init_completion(&data->prev_finished);
-
-	spin_lock(&async_scan_lock);
-	/* Check that there's still somebody else on the list */
-	if (list_empty(&scanning_hosts))
-		goto done;
-	list_add_tail(&data->list, &scanning_hosts);
-	spin_unlock(&async_scan_lock);
-
-	printk(KERN_INFO "scsi: waiting for bus probes to complete ...\n");
-	wait_for_completion(&data->prev_finished);
-
-	spin_lock(&async_scan_lock);
-	list_del(&data->list);
-	if (!list_empty(&scanning_hosts)) {
-		struct async_scan_data *next = list_entry(scanning_hosts.next,
-				struct async_scan_data, list);
-		complete(&next->prev_finished);
-	}
- done:
-	spin_unlock(&async_scan_lock);
-
-	kfree(data);
+	async_synchronize_full_domain(&scan_domain);
 	return 0;
 }
 
@@ -1711,88 +1666,31 @@ static void scsi_sysfs_add_devices(struct Scsi_Host *shost)
 	}
 }
 
-/**
- * scsi_prep_async_scan - prepare for an async scan
- * @shost: the host which will be scanned
- * Returns: a cookie to be passed to scsi_finish_async_scan()
- *
- * Tells the midlayer this host is going to do an asynchronous scan.
- * It reserves the host's position in the scanning list and ensures
- * that other asynchronous scans started after this one won't affect the
- * ordering of the discovered devices.
- */
-static struct async_scan_data *scsi_prep_async_scan(struct Scsi_Host *shost)
+static void async_scsi_scan_host(void *data, async_cookie_t cookie)
 {
-	struct async_scan_data *data;
+	struct Scsi_Host *shost = data;
 	unsigned long flags;
 
-	if (strncmp(scsi_scan_type, "sync", 4) == 0)
-		return NULL;
-
-	if (shost->async_scan) {
-		printk("%s called twice for host %d", __func__,
-				shost->host_no);
-		dump_stack();
-		return NULL;
-	}
-
-	data = kmalloc(sizeof(*data), GFP_KERNEL);
-	if (!data)
-		goto err;
-	data->shost = scsi_host_get(shost);
-	if (!data->shost)
-		goto err;
-	init_completion(&data->prev_finished);
-
-	mutex_lock(&shost->scan_mutex);
 	spin_lock_irqsave(shost->host_lock, flags);
 	shost->async_scan = 1;
 	spin_unlock_irqrestore(shost->host_lock, flags);
-	mutex_unlock(&shost->scan_mutex);
-
-	spin_lock(&async_scan_lock);
-	if (list_empty(&scanning_hosts))
-		complete(&data->prev_finished);
-	list_add_tail(&data->list, &scanning_hosts);
-	spin_unlock(&async_scan_lock);
-
-	return data;
 
- err:
-	kfree(data);
-	return NULL;
-}
-
-/**
- * scsi_finish_async_scan - asynchronous scan has finished
- * @data: cookie returned from earlier call to scsi_prep_async_scan()
- *
- * All the devices currently attached to this host have been found.
- * This function announces all the devices it has found to the rest
- * of the system.
- */
-static void scsi_finish_async_scan(struct async_scan_data *data)
-{
-	struct Scsi_Host *shost;
-	unsigned long flags;
+	if (shost->hostt->scan_finished) {
+		unsigned long start = jiffies;
+		if (shost->hostt->scan_start)
+			shost->hostt->scan_start(shost);
 
-	if (!data)
-		return;
+		while (!shost->hostt->scan_finished(shost, jiffies - start))
+			msleep(10);
+	} else {
+		scsi_scan_host_selected(shost, SCAN_WILD_CARD, SCAN_WILD_CARD,
+				SCAN_WILD_CARD, 0);
+	}
 
-	shost = data->shost;
+	async_synchronize_cookie_domain(cookie, &scan_domain);
 
 	mutex_lock(&shost->scan_mutex);
 
-	if (!shost->async_scan) {
-		printk("%s called twice for host %d", __func__,
-				shost->host_no);
-		dump_stack();
-		mutex_unlock(&shost->scan_mutex);
-		return;
-	}
-
-	wait_for_completion(&data->prev_finished);
-
 	scsi_sysfs_add_devices(shost);
 
 	spin_lock_irqsave(shost->host_lock, flags);
@@ -1801,40 +1699,7 @@ static void scsi_finish_async_scan(struct async_scan_data *data)
 
 	mutex_unlock(&shost->scan_mutex);
 
-	spin_lock(&async_scan_lock);
-	list_del(&data->list);
-	if (!list_empty(&scanning_hosts)) {
-		struct async_scan_data *next = list_entry(scanning_hosts.next,
-				struct async_scan_data, list);
-		complete(&next->prev_finished);
-	}
-	spin_unlock(&async_scan_lock);
-
 	scsi_host_put(shost);
-	kfree(data);
-}
-
-static void do_scsi_scan_host(struct Scsi_Host *shost)
-{
-	if (shost->hostt->scan_finished) {
-		unsigned long start = jiffies;
-		if (shost->hostt->scan_start)
-			shost->hostt->scan_start(shost);
-
-		while (!shost->hostt->scan_finished(shost, jiffies - start))
-			msleep(10);
-	} else {
-		scsi_scan_host_selected(shost, SCAN_WILD_CARD, SCAN_WILD_CARD,
-				SCAN_WILD_CARD, 0);
-	}
-}
-
-static int do_scan_async(void *_data)
-{
-	struct async_scan_data *data = _data;
-	do_scsi_scan_host(data->shost);
-	scsi_finish_async_scan(data);
-	return 0;
 }
 
 /**
@@ -1843,21 +1708,16 @@ static int do_scan_async(void *_data)
  **/
 void scsi_scan_host(struct Scsi_Host *shost)
 {
-	struct task_struct *p;
-	struct async_scan_data *data;
+	async_cookie_t cookie;
 
 	if (strncmp(scsi_scan_type, "none", 4) == 0)
 		return;
 
-	data = scsi_prep_async_scan(shost);
-	if (!data) {
-		do_scsi_scan_host(shost);
-		return;
-	}
-
-	p = kthread_run(do_scan_async, data, "scsi_scan_%d", shost->host_no);
-	if (IS_ERR(p))
-		do_scan_async(data);
+	scsi_host_get(shost);
+	cookie = async_schedule_domain(async_scsi_scan_host, shost,
+								&scan_domain);
+	if (strncmp(scsi_scan_type, "sync", 4) == 0)
+		async_synchronize_cookie_domain(cookie, &scan_domain);
 }
 EXPORT_SYMBOL(scsi_scan_host);
 

-- 
Matthew Wilcox				Intel Open Source Technology Centre
"Bill, look, we understand that you're interested in selling us this
operating system, but compare it to ours.  We can't possibly take such
a retrograde step."

^ permalink raw reply related	[flat|nested] 7+ messages in thread

* Re: [PATCH] Convert scsi_scan to use generic async mechanism
  2009-04-28 19:35 [PATCH] Convert scsi_scan to use generic async mechanism Matthew Wilcox
@ 2009-04-29 14:39 ` Matthew Wilcox
  2009-05-23 16:21 ` James Bottomley
  2009-05-23 20:39 ` James Bottomley
  2 siblings, 0 replies; 7+ messages in thread
From: Matthew Wilcox @ 2009-04-29 14:39 UTC (permalink / raw)
  To: linux-scsi

On Tue, Apr 28, 2009 at 01:35:58PM -0600, Matthew Wilcox wrote:
> The new generic async scanning infrastructure is a perfect replacement
> for the scsi async scanning code.  We do need to use a separate domain
> as libata drivers will deadlock waiting for themselves to complete if
> we don't.  Tested with 515 LUNs (3 on AHCI, two fibre channel cards,
> each with two targets, each with 128 LUNs).

I don't think this patch makes the situation any _worse_ than the
current code, but I wonder if the scan_mutex should be held over the
call to scsi_scan_host_selected().  Any thoughts?

> Signed-off-by: Matthew Wilcox <willy@linux.intel.com>
> 
> diff --git a/drivers/scsi/scsi_scan.c b/drivers/scsi/scsi_scan.c
> index 6f51ca4..515e7f9 100644
> --- a/drivers/scsi/scsi_scan.c
> +++ b/drivers/scsi/scsi_scan.c
> @@ -122,15 +122,7 @@ MODULE_PARM_DESC(inq_timeout,
>  		 "Timeout (in seconds) waiting for devices to answer INQUIRY."
>  		 " Default is 5. Some non-compliant devices need more.");
>  
> -/* This lock protects only this list */
> -static DEFINE_SPINLOCK(async_scan_lock);
> -static LIST_HEAD(scanning_hosts);
> -
> -struct async_scan_data {
> -	struct list_head list;
> -	struct Scsi_Host *shost;
> -	struct completion prev_finished;
> -};
> +static LIST_HEAD(scan_domain);
>  
>  /**
>   * scsi_complete_async_scans - Wait for asynchronous scans to complete
> @@ -142,44 +134,7 @@ struct async_scan_data {
>   */
>  int scsi_complete_async_scans(void)
>  {
> -	struct async_scan_data *data;
> -
> -	do {
> -		if (list_empty(&scanning_hosts))
> -			return 0;
> -		/* If we can't get memory immediately, that's OK.  Just
> -		 * sleep a little.  Even if we never get memory, the async
> -		 * scans will finish eventually.
> -		 */
> -		data = kmalloc(sizeof(*data), GFP_KERNEL);
> -		if (!data)
> -			msleep(1);
> -	} while (!data);
> -
> -	data->shost = NULL;
> -	init_completion(&data->prev_finished);
> -
> -	spin_lock(&async_scan_lock);
> -	/* Check that there's still somebody else on the list */
> -	if (list_empty(&scanning_hosts))
> -		goto done;
> -	list_add_tail(&data->list, &scanning_hosts);
> -	spin_unlock(&async_scan_lock);
> -
> -	printk(KERN_INFO "scsi: waiting for bus probes to complete ...\n");
> -	wait_for_completion(&data->prev_finished);
> -
> -	spin_lock(&async_scan_lock);
> -	list_del(&data->list);
> -	if (!list_empty(&scanning_hosts)) {
> -		struct async_scan_data *next = list_entry(scanning_hosts.next,
> -				struct async_scan_data, list);
> -		complete(&next->prev_finished);
> -	}
> - done:
> -	spin_unlock(&async_scan_lock);
> -
> -	kfree(data);
> +	async_synchronize_full_domain(&scan_domain);
>  	return 0;
>  }
>  
> @@ -1711,88 +1666,31 @@ static void scsi_sysfs_add_devices(struct Scsi_Host *shost)
>  	}
>  }
>  
> -/**
> - * scsi_prep_async_scan - prepare for an async scan
> - * @shost: the host which will be scanned
> - * Returns: a cookie to be passed to scsi_finish_async_scan()
> - *
> - * Tells the midlayer this host is going to do an asynchronous scan.
> - * It reserves the host's position in the scanning list and ensures
> - * that other asynchronous scans started after this one won't affect the
> - * ordering of the discovered devices.
> - */
> -static struct async_scan_data *scsi_prep_async_scan(struct Scsi_Host *shost)
> +static void async_scsi_scan_host(void *data, async_cookie_t cookie)
>  {
> -	struct async_scan_data *data;
> +	struct Scsi_Host *shost = data;
>  	unsigned long flags;
>  
> -	if (strncmp(scsi_scan_type, "sync", 4) == 0)
> -		return NULL;
> -
> -	if (shost->async_scan) {
> -		printk("%s called twice for host %d", __func__,
> -				shost->host_no);
> -		dump_stack();
> -		return NULL;
> -	}
> -
> -	data = kmalloc(sizeof(*data), GFP_KERNEL);
> -	if (!data)
> -		goto err;
> -	data->shost = scsi_host_get(shost);
> -	if (!data->shost)
> -		goto err;
> -	init_completion(&data->prev_finished);
> -
> -	mutex_lock(&shost->scan_mutex);
>  	spin_lock_irqsave(shost->host_lock, flags);
>  	shost->async_scan = 1;
>  	spin_unlock_irqrestore(shost->host_lock, flags);
> -	mutex_unlock(&shost->scan_mutex);
> -
> -	spin_lock(&async_scan_lock);
> -	if (list_empty(&scanning_hosts))
> -		complete(&data->prev_finished);
> -	list_add_tail(&data->list, &scanning_hosts);
> -	spin_unlock(&async_scan_lock);
> -
> -	return data;
>  
> - err:
> -	kfree(data);
> -	return NULL;
> -}
> -
> -/**
> - * scsi_finish_async_scan - asynchronous scan has finished
> - * @data: cookie returned from earlier call to scsi_prep_async_scan()
> - *
> - * All the devices currently attached to this host have been found.
> - * This function announces all the devices it has found to the rest
> - * of the system.
> - */
> -static void scsi_finish_async_scan(struct async_scan_data *data)
> -{
> -	struct Scsi_Host *shost;
> -	unsigned long flags;
> +	if (shost->hostt->scan_finished) {
> +		unsigned long start = jiffies;
> +		if (shost->hostt->scan_start)
> +			shost->hostt->scan_start(shost);
>  
> -	if (!data)
> -		return;
> +		while (!shost->hostt->scan_finished(shost, jiffies - start))
> +			msleep(10);
> +	} else {
> +		scsi_scan_host_selected(shost, SCAN_WILD_CARD, SCAN_WILD_CARD,
> +				SCAN_WILD_CARD, 0);
> +	}
>  
> -	shost = data->shost;
> +	async_synchronize_cookie_domain(cookie, &scan_domain);
>  
>  	mutex_lock(&shost->scan_mutex);
>  
> -	if (!shost->async_scan) {
> -		printk("%s called twice for host %d", __func__,
> -				shost->host_no);
> -		dump_stack();
> -		mutex_unlock(&shost->scan_mutex);
> -		return;
> -	}
> -
> -	wait_for_completion(&data->prev_finished);
> -
>  	scsi_sysfs_add_devices(shost);
>  
>  	spin_lock_irqsave(shost->host_lock, flags);
> @@ -1801,40 +1699,7 @@ static void scsi_finish_async_scan(struct async_scan_data *data)
>  
>  	mutex_unlock(&shost->scan_mutex);
>  
> -	spin_lock(&async_scan_lock);
> -	list_del(&data->list);
> -	if (!list_empty(&scanning_hosts)) {
> -		struct async_scan_data *next = list_entry(scanning_hosts.next,
> -				struct async_scan_data, list);
> -		complete(&next->prev_finished);
> -	}
> -	spin_unlock(&async_scan_lock);
> -
>  	scsi_host_put(shost);
> -	kfree(data);
> -}
> -
> -static void do_scsi_scan_host(struct Scsi_Host *shost)
> -{
> -	if (shost->hostt->scan_finished) {
> -		unsigned long start = jiffies;
> -		if (shost->hostt->scan_start)
> -			shost->hostt->scan_start(shost);
> -
> -		while (!shost->hostt->scan_finished(shost, jiffies - start))
> -			msleep(10);
> -	} else {
> -		scsi_scan_host_selected(shost, SCAN_WILD_CARD, SCAN_WILD_CARD,
> -				SCAN_WILD_CARD, 0);
> -	}
> -}
> -
> -static int do_scan_async(void *_data)
> -{
> -	struct async_scan_data *data = _data;
> -	do_scsi_scan_host(data->shost);
> -	scsi_finish_async_scan(data);
> -	return 0;
>  }
>  
>  /**
> @@ -1843,21 +1708,16 @@ static int do_scan_async(void *_data)
>   **/
>  void scsi_scan_host(struct Scsi_Host *shost)
>  {
> -	struct task_struct *p;
> -	struct async_scan_data *data;
> +	async_cookie_t cookie;
>  
>  	if (strncmp(scsi_scan_type, "none", 4) == 0)
>  		return;
>  
> -	data = scsi_prep_async_scan(shost);
> -	if (!data) {
> -		do_scsi_scan_host(shost);
> -		return;
> -	}
> -
> -	p = kthread_run(do_scan_async, data, "scsi_scan_%d", shost->host_no);
> -	if (IS_ERR(p))
> -		do_scan_async(data);
> +	scsi_host_get(shost);
> +	cookie = async_schedule_domain(async_scsi_scan_host, shost,
> +								&scan_domain);
> +	if (strncmp(scsi_scan_type, "sync", 4) == 0)
> +		async_synchronize_cookie_domain(cookie, &scan_domain);
>  }
>  EXPORT_SYMBOL(scsi_scan_host);
>  
> 
> -- 
> Matthew Wilcox				Intel Open Source Technology Centre
> "Bill, look, we understand that you're interested in selling us this
> operating system, but compare it to ours.  We can't possibly take such
> a retrograde step."
> --
> To unsubscribe from this list: send the line "unsubscribe linux-scsi" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

-- 
Matthew Wilcox				Intel Open Source Technology Centre
"Bill, look, we understand that you're interested in selling us this
operating system, but compare it to ours.  We can't possibly take such
a retrograde step."

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] Convert scsi_scan to use generic async mechanism
  2009-04-28 19:35 [PATCH] Convert scsi_scan to use generic async mechanism Matthew Wilcox
  2009-04-29 14:39 ` Matthew Wilcox
@ 2009-05-23 16:21 ` James Bottomley
  2009-05-23 16:51   ` Arjan van de Ven
  2009-05-23 20:39 ` James Bottomley
  2 siblings, 1 reply; 7+ messages in thread
From: James Bottomley @ 2009-05-23 16:21 UTC (permalink / raw)
  To: Matthew Wilcox; +Cc: linux-scsi, Arjan van de Ven, Brian King

On Tue, 2009-04-28 at 13:35 -0600, Matthew Wilcox wrote:
> The new generic async scanning infrastructure is a perfect replacement
> for the scsi async scanning code.  We do need to use a separate domain
> as libata drivers will deadlock waiting for themselves to complete if
> we don't.  Tested with 515 LUNs (3 on AHCI, two fibre channel cards,
> each with two targets, each with 128 LUNs).

I'm afraid this patch fails in testing with the ipr driver by causing a
boot hang:


INFO: task modprobe:424 blocked for more than 120 seconds.
"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this
message.
modprobe      D 000000000ff61bb4     0   424      1
Call Trace:
[c00000007875af50] [c00000007875b000] 0xc00000007875b000 (unreliable)
[c00000007875b120] [c0000000000121fc] .__switch_to+0x14c/0x1ac
[c00000007875b1b0] [c00000000039e9ec] .__schedule+0x9c4/0xaa8
[c00000007875b2e0] [c00000000039eaec] .schedule+0x1c/0x3c
[c00000007875b360] [c000000000085ab0] .async_synchronize_cookie_domain
+0xec/0x178
[c00000007875b440] [d000000000ca00d8] .__scsi_add_device+0xb0/0x130
[scsi_mod]
[c00000007875b500] [d000000000ca016c] .scsi_add_device+0x14/0x44
[scsi_mod]
[c00000007875b570] [d000000000e77094] .ipr_probe+0x11d4/0x12d4 [ipr]
[c00000007875b6c0] [c0000000001fe028] .local_pci_probe+0x34/0x48
[c00000007875b730] [c0000000001fed2c] .pci_device_probe+0xe8/0x130
[c00000007875b7e0] [c0000000002ca9f8] .driver_probe_device+0xd4/0x1bc
[c00000007875b880] [c0000000002cab74] .__driver_attach+0x94/0xd8
[c00000007875b910] [c0000000002c9f84] .bus_for_each_dev+0x80/0xe8
[c00000007875b9c0] [c0000000002ca7c8] .driver_attach+0x28/0x40
[c00000007875ba40] [c0000000002c9628] .bus_add_driver+0x138/0x2d8
[c00000007875bae0] [c0000000002cafe8] .driver_register+0xf0/0x1b0
[c00000007875bb80] [c0000000001ff2b8] .__pci_register_driver+0x70/0x11c
[c00000007875bc20] [d000000000e771cc] .ipr_init+0x38/0x1af4 [ipr]
[c00000007875bca0] [c0000000000092d8] .do_one_initcall+0x80/0x1a4
[c00000007875bd90] [c00000000009f468] .SyS_init_module+0xd8/0x240
[c00000007875be30] [c000000000008554] syscall_exit+0x0/0x40
1 lock held by modprobe/424:
 #0:  (&shost->scan_mutex){+.+...}, at:
[<d000000000ca00c0>] .__scsi_add_device+0x98/0x130 [scsi_mod]

(This kernel was configured for SYNC scanning).

The problem has its roots in the way the ipr driver works.  ipr is a
hybrid SCSI/RAID card, very much in the mold of fusion.  However, unlike
fusion it treats everything as a RAID, so my single pass through SAS
disk on an ipr card is presented natively, it's not attached to the SAS
transports.

The problem is in ipr.c:7612 (it's trying to make the device visible
using scsi_add_device) and hanging.

The device it's trying to add is this one:

Host: scsi0 Channel: 255 Id: 255 Lun: 255
  Vendor: IBM      Model: 572C001SISIOA    Rev: 0150
  Type:   Unknown                          ANSI SCSI revision: 03

The reason scsi_add_device() is failing seems to be that
async_synchronize_full_domain() is a bit fragile in that it only expects
to be called once.  Call it again, like we do, to make sure there aren't
any outstanding scans and it hangs on the wait event.

This simplest fix might be just to take the async wait out of our sync
methods, like the patch below.  Alternatively, perhaps
async_synchronize_full_domain() should be made a bit more robust?

James

---
diff --git a/drivers/scsi/scsi_scan.c b/drivers/scsi/scsi_scan.c
index 7d7db71..e449435 100644
--- a/drivers/scsi/scsi_scan.c
+++ b/drivers/scsi/scsi_scan.c
@@ -1472,8 +1472,6 @@ struct scsi_device *__scsi_add_device(struct Scsi_Host *shost, uint channel,
 		return ERR_PTR(-ENOMEM);
 
 	mutex_lock(&shost->scan_mutex);
-	if (!shost->async_scan)
-		scsi_complete_async_scans();
 
 	if (scsi_host_scan_allowed(shost))
 		scsi_probe_and_add_lun(starget, lun, NULL, &sdev, 1, hostdata);
@@ -1587,8 +1585,6 @@ void scsi_scan_target(struct device *parent, unsigned int channel,
 		return;
 
 	mutex_lock(&shost->scan_mutex);
-	if (!shost->async_scan)
-		scsi_complete_async_scans();
 
 	if (scsi_host_scan_allowed(shost))
 		__scsi_scan_target(parent, channel, id, lun, rescan);
@@ -1640,8 +1636,6 @@ int scsi_scan_host_selected(struct Scsi_Host *shost, unsigned int channel,
 		return -EINVAL;
 
 	mutex_lock(&shost->scan_mutex);
-	if (!shost->async_scan)
-		scsi_complete_async_scans();
 
 	if (scsi_host_scan_allowed(shost)) {
 		if (channel == SCAN_WILD_CARD)






^ permalink raw reply related	[flat|nested] 7+ messages in thread

* Re: [PATCH] Convert scsi_scan to use generic async mechanism
  2009-05-23 16:21 ` James Bottomley
@ 2009-05-23 16:51   ` Arjan van de Ven
  2009-05-23 17:07     ` James Bottomley
  2009-05-23 20:42     ` James Bottomley
  0 siblings, 2 replies; 7+ messages in thread
From: Arjan van de Ven @ 2009-05-23 16:51 UTC (permalink / raw)
  To: James Bottomley; +Cc: Matthew Wilcox, linux-scsi, Brian King

On Sat, 23 May 2009 11:21:43 -0500
> The reason scsi_add_device() is failing seems to be that
> async_synchronize_full_domain() is a bit fragile in that it only
> expects to be called once.  Call it again, like we do, to make sure
> there aren't any outstanding scans and it hangs on the wait event.

it's supposed to be ok to call as many times as you want.
What is NOT allowed is calling it from async work itself, due to the
obvious deadlock.

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] Convert scsi_scan to use generic async mechanism
  2009-05-23 16:51   ` Arjan van de Ven
@ 2009-05-23 17:07     ` James Bottomley
  2009-05-23 20:42     ` James Bottomley
  1 sibling, 0 replies; 7+ messages in thread
From: James Bottomley @ 2009-05-23 17:07 UTC (permalink / raw)
  To: Arjan van de Ven; +Cc: Matthew Wilcox, linux-scsi, Brian King

On Sat, 2009-05-23 at 09:51 -0700, Arjan van de Ven wrote:
> On Sat, 23 May 2009 11:21:43 -0500
> > The reason scsi_add_device() is failing seems to be that
> > async_synchronize_full_domain() is a bit fragile in that it only
> > expects to be called once.  Call it again, like we do, to make sure
> > there aren't any outstanding scans and it hangs on the wait event.
> 
> it's supposed to be ok to call as many times as you want.
> What is NOT allowed is calling it from async work itself, due to the
> obvious deadlock.

The call trace doesn't seem to show we're nested ... although I suppose
we could eventually be if someone decided to do async PCI scanning.

Any other thoughts as to why this would be hanging?

James



^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] Convert scsi_scan to use generic async mechanism
  2009-04-28 19:35 [PATCH] Convert scsi_scan to use generic async mechanism Matthew Wilcox
  2009-04-29 14:39 ` Matthew Wilcox
  2009-05-23 16:21 ` James Bottomley
@ 2009-05-23 20:39 ` James Bottomley
  2 siblings, 0 replies; 7+ messages in thread
From: James Bottomley @ 2009-05-23 20:39 UTC (permalink / raw)
  To: Matthew Wilcox; +Cc: linux-scsi, Arjan van de Ven

On Tue, 2009-04-28 at 13:35 -0600, Matthew Wilcox wrote:
> +	scsi_host_get(shost);
> +	cookie = async_schedule_domain(async_scsi_scan_host, shost,
> +								&scan_domain);
> +	if (strncmp(scsi_scan_type, "sync", 4) == 0)
> +		async_synchronize_cookie_domain(cookie, &scan_domain);

This last statement doesn't actually do anything.  It waits for all
prior scans to complete, but not this one, so we're not actually
synchronous here ... we don't wait for the scan we just kicked off to
complete.

Arjan suggests a way to hack around this might be
async_synchronize_cookie_domain(cookie + 1, &scan_domain);

James



^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] Convert scsi_scan to use generic async mechanism
  2009-05-23 16:51   ` Arjan van de Ven
  2009-05-23 17:07     ` James Bottomley
@ 2009-05-23 20:42     ` James Bottomley
  1 sibling, 0 replies; 7+ messages in thread
From: James Bottomley @ 2009-05-23 20:42 UTC (permalink / raw)
  To: Arjan van de Ven; +Cc: Matthew Wilcox, linux-scsi, Brian King

On Sat, 2009-05-23 at 09:51 -0700, Arjan van de Ven wrote:
> On Sat, 23 May 2009 11:21:43 -0500
> > The reason scsi_add_device() is failing seems to be that
> > async_synchronize_full_domain() is a bit fragile in that it only
> > expects to be called once.  Call it again, like we do, to make sure
> > there aren't any outstanding scans and it hangs on the wait event.
> 
> it's supposed to be ok to call as many times as you want.
> What is NOT allowed is calling it from async work itself, due to the
> obvious deadlock.

OK, this turns out to be a classic ABBA deadlock.

async_synchronize_domain() is one waiter and the scan mutex is the
other.  What's happening is that scsi_add_device() takes the scan mutex
and then waits for the async scan thread to complete.  Meanwhile the
async thread is dropping and reacquiring the mutex as it moves from
scanning to adding devices.  Result: Deadlock.

I think a reasonable fix is to take the scan mutex *after* waiting for
the async scans to complete.  An alternative might be to have a version
of scsi_scan_host_selected that doesn't take the mutex so we can hold it
entirely over async_scsi_scan_host(), but that gets a bit messy.

I'll drop the async scan conversion patch until we can get this all
sorted out.

James



^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2009-05-23 20:42 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2009-04-28 19:35 [PATCH] Convert scsi_scan to use generic async mechanism Matthew Wilcox
2009-04-29 14:39 ` Matthew Wilcox
2009-05-23 16:21 ` James Bottomley
2009-05-23 16:51   ` Arjan van de Ven
2009-05-23 17:07     ` James Bottomley
2009-05-23 20:42     ` James Bottomley
2009-05-23 20:39 ` James Bottomley

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).