All of lore.kernel.org
 help / color / mirror / Atom feed
From: Bart Van Assche <Bart.VanAssche@wdc.com>
To: "xjtuwjp@gmail.com" <xjtuwjp@gmail.com>
Cc: "jejb@linux.vnet.ibm.com" <jejb@linux.vnet.ibm.com>,
	"linux-scsi@vger.kernel.org" <linux-scsi@vger.kernel.org>,
	"hare@suse.de" <hare@suse.de>,
	"martin.petersen@oracle.com" <martin.petersen@oracle.com>,
	"snitzer@redhat.com" <snitzer@redhat.com>
Subject: Re: [PATCH 0/2] sd: Fix a deadlock between event checking and device removal
Date: Tue, 14 Nov 2017 17:33:50 +0000	[thread overview]
Message-ID: <1510680828.2280.16.camel@sandisk.com> (raw)
In-Reply-To: <CAD+HZHUVsaDYO7yL2Ct868kdK=dE3PuRmy-Jfp2d2zmEBJPwFg@mail.gmail.com>

On Tue, 2017-11-14 at 18:01 +0100, Jack Wang wrote:
> I suspect we run into same bug you were trying to fix in this patch
> set. we're running in v4.4.50
> 
> I was trying to reproduce it, but no lucky yet, do you still have your
> reproducer?

Hello Jack,

I can reproduce this about every fifth run of test one of the srp-test
software and with the SRP initiator and target drivers of what will become
kernel v4.15-rc1 and by switching the ib_srpt driver from non-SRQ to SRQ
mode while the initiator is logging in. I'm currently analyzing where in the
block layer a queue run is missing. The patch below for the sd driver does
not fix the root cause but seems to help.

Bart.


Subject: [PATCH] Increase SCSI disk probing concurrency

---
 drivers/scsi/scsi.c        |  5 -----
 drivers/scsi/scsi_pm.c     |  6 ++++--
 drivers/scsi/scsi_priv.h   |  1 -
 drivers/scsi/sd.c          | 26 +++++++++++++++++++++-----
 drivers/scsi/sd.h          |  1 +
 include/scsi/scsi_driver.h |  1 +
 6 files changed, 27 insertions(+), 13 deletions(-)

diff --git a/drivers/scsi/scsi.c b/drivers/scsi/scsi.c
index a7e4fba724b7..e6d69e647f6a 100644
--- a/drivers/scsi/scsi.c
+++ b/drivers/scsi/scsi.c
@@ -85,10 +85,6 @@ unsigned int scsi_logging_level;
 EXPORT_SYMBOL(scsi_logging_level);
 #endif
 
-/* sd, scsi core and power management need to coordinate flushing async actions */
-ASYNC_DOMAIN(scsi_sd_probe_domain);
-EXPORT_SYMBOL(scsi_sd_probe_domain);
-
 /*
  * Separate domain (from scsi_sd_probe_domain) to maximize the benefit of
  * asynchronous system resume operations.  It is marked 'exclusive' to avoid
@@ -839,7 +835,6 @@ static void __exit exit_scsi(void)
 	scsi_exit_devinfo();
 	scsi_exit_procfs();
 	scsi_exit_queue();
-	async_unregister_domain(&scsi_sd_probe_domain);
 }
 
 subsys_initcall(init_scsi);
diff --git a/drivers/scsi/scsi_pm.c b/drivers/scsi/scsi_pm.c
index b44c1bb687a2..d8e43c2f4d40 100644
--- a/drivers/scsi/scsi_pm.c
+++ b/drivers/scsi/scsi_pm.c
@@ -171,9 +171,11 @@ static int scsi_bus_resume_common(struct device *dev,
 static int scsi_bus_prepare(struct device *dev)
 {
 	if (scsi_is_sdev_device(dev)) {
-		/* sd probing uses async_schedule.  Wait until it finishes. */
-		async_synchronize_full_domain(&scsi_sd_probe_domain);
+		struct scsi_driver *drv = to_scsi_driver(dev->driver);
 
+		/* sd probing happens asynchronously. Wait until it finishes. */
+		if (drv->sync)
+			drv->sync(dev);
 	} else if (scsi_is_host_device(dev)) {
 		/* Wait until async scanning is finished */
 		scsi_complete_async_scans();
diff --git a/drivers/scsi/scsi_priv.h b/drivers/scsi/scsi_priv.h
index dab29f538612..bf0cadf6a321 100644
--- a/drivers/scsi/scsi_priv.h
+++ b/drivers/scsi/scsi_priv.h
@@ -174,7 +174,6 @@ static inline void scsi_autopm_put_host(struct Scsi_Host *h) {}
 #endif /* CONFIG_PM */
 
 extern struct async_domain scsi_sd_pm_domain;
-extern struct async_domain scsi_sd_probe_domain;
 
 /* scsi_dh.c */
 #ifdef CONFIG_SCSI_DH
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index 0313486d85c8..c26dbb38b60c 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -112,6 +112,7 @@ static void sd_shutdown(struct device *);
 static int sd_suspend_system(struct device *);
 static int sd_suspend_runtime(struct device *);
 static int sd_resume(struct device *);
+static void sd_sync_probe_domain(struct device *dev);
 static void sd_rescan(struct device *);
 static int sd_init_command(struct scsi_cmnd *SCpnt);
 static void sd_uninit_command(struct scsi_cmnd *SCpnt);
@@ -564,6 +565,7 @@ static struct scsi_driver sd_template = {
 		.shutdown	= sd_shutdown,
 		.pm		= &sd_pm_ops,
 	},
+	.sync			= sd_sync_probe_domain,
 	.rescan			= sd_rescan,
 	.init_command		= sd_init_command,
 	.uninit_command		= sd_uninit_command,
@@ -3221,9 +3223,9 @@ static int sd_format_disk_name(char *prefix, int index, char *buf, int buflen)
 /*
  * The asynchronous part of sd_probe
  */
-static void sd_probe_async(void *data, async_cookie_t cookie)
+static void sd_probe_async(struct work_struct *work)
 {
-	struct scsi_disk *sdkp = data;
+	struct scsi_disk *sdkp = container_of(work, typeof(*sdkp), probe_work);
 	struct scsi_device *sdp;
 	struct gendisk *gd;
 	u32 index;
@@ -3326,6 +3328,8 @@ static int sd_probe(struct device *dev)
 	if (!sdkp)
 		goto out;
 
+	INIT_WORK(&sdkp->probe_work, sd_probe_async);
+
 	gd = alloc_disk(SD_MINORS);
 	if (!gd)
 		goto out_free;
@@ -3377,8 +3381,8 @@ static int sd_probe(struct device *dev)
 	get_device(dev);
 	dev_set_drvdata(dev, sdkp);
 
-	get_device(&sdkp->dev);	/* prevent release before async_schedule */
-	async_schedule_domain(sd_probe_async, sdkp, &scsi_sd_probe_domain);
+	get_device(&sdkp->dev);	/* prevent release before sd_probe_async() */
+	WARN_ON_ONCE(!queue_work(system_unbound_wq, &sdkp->probe_work));
 
 	return 0;
 
@@ -3395,6 +3399,18 @@ static int sd_probe(struct device *dev)
 	return error;
 }
 
+static void sd_wait_for_probing(struct scsi_disk *sdkp)
+{
+	flush_work(&sdkp->probe_work);
+}
+
+static void sd_sync_probe_domain(struct device *dev)
+{
+	struct scsi_disk *sdkp = dev_get_drvdata(dev);
+
+	sd_wait_for_probing(sdkp);
+}
+
 /**
  *	sd_remove - called whenever a scsi disk (previously recognized by
  *	sd_probe) is detached from the system. It is called (potentially
@@ -3416,7 +3432,7 @@ static int sd_remove(struct device *dev)
 	scsi_autopm_get_device(sdkp->device);
 
 	async_synchronize_full_domain(&scsi_sd_pm_domain);
-	async_synchronize_full_domain(&scsi_sd_probe_domain);
+	sd_wait_for_probing(sdkp);
 	device_del(&sdkp->dev);
 	del_gendisk(sdkp->disk);
 	sd_shutdown(dev);
diff --git a/drivers/scsi/sd.h b/drivers/scsi/sd.h
index 7b57dafcd45a..2cc47183c9aa 100644
--- a/drivers/scsi/sd.h
+++ b/drivers/scsi/sd.h
@@ -81,6 +81,7 @@ struct scsi_disk {
 	unsigned int	zones_optimal_nonseq;
 	unsigned int	zones_max_open;
 #endif
+	struct work_struct probe_work;
 	atomic_t	openers;
 	sector_t	capacity;	/* size in logical blocks */
 	u32		max_xfer_blocks;
diff --git a/include/scsi/scsi_driver.h b/include/scsi/scsi_driver.h
index a5534ccad859..145d6239eecf 100644
--- a/include/scsi/scsi_driver.h
+++ b/include/scsi/scsi_driver.h
@@ -11,6 +11,7 @@ struct scsi_device;
 struct scsi_driver {
 	struct device_driver	gendrv;
 
+	void (*sync)(struct device *);
 	void (*rescan)(struct device *);
 	int (*init_command)(struct scsi_cmnd *);
 	void (*uninit_command)(struct scsi_cmnd *);
-- 
2.15.0

  reply	other threads:[~2017-11-14 17:33 UTC|newest]

Thread overview: 11+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2016-11-12  0:38 [PATCH 0/2] sd: Fix a deadlock between event checking and device removal Bart Van Assche
2016-11-12  0:39 ` [PATCH 1/2] block, dm-mpath: Introduce request flag REQ_FAIL_IF_NO_PATH Bart Van Assche
2016-11-12  0:40 ` [PATCH 2/2] sd: Fix a deadlock between event checking and device removal Bart Van Assche
2016-11-13  5:47 ` [PATCH 0/2] " James Bottomley
2016-11-14 18:35   ` Bart Van Assche
2017-11-14 17:01     ` Jack Wang
2017-11-14 17:33       ` Bart Van Assche [this message]
2017-11-17 15:14         ` Jack Wang
2017-11-17 17:01           ` Bart Van Assche
2017-11-17 17:10             ` Jack Wang
2017-11-17 17:28               ` Bart Van Assche

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1510680828.2280.16.camel@sandisk.com \
    --to=bart.vanassche@wdc.com \
    --cc=hare@suse.de \
    --cc=jejb@linux.vnet.ibm.com \
    --cc=linux-scsi@vger.kernel.org \
    --cc=martin.petersen@oracle.com \
    --cc=snitzer@redhat.com \
    --cc=xjtuwjp@gmail.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.