[PATCH 7/7] scsi: Add 'eh_deadline' to limit SCSI EH runtime

linux-scsi.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

From: Hannes Reinecke <hare@suse.de>
To: James Bottomley <jbottomley@parallels.com>
Cc: linux-scsi@vger.kernel.org, Joern Engel <joern@logfs.org>,
	Ewan Milne <emilne@redhat.com>,
	James Smart <james.smart@emulex.com>,
	Ren Mingxin <renmx@cn.fujitsu.com>,
	Roland Dreier <roland@purestorage.com>,
	Bryn Reeves <bmr@redhat.com>,
	Christoph Hellwig <hch@infradead.org>,
	Hannes Reinecke <hare@suse.de>
Subject: [PATCH 7/7] scsi: Add 'eh_deadline' to limit SCSI EH runtime
Date: Mon, 10 Jun 2013 13:11:53 +0200	[thread overview]
Message-ID: <1370862713-41323-8-git-send-email-hare@suse.de> (raw)
In-Reply-To: <1370862713-41323-1-git-send-email-hare@suse.de>

This patch adds an 'eh_deadline' attribute to the SCSI
host which limits the overall runtime of the SCSI EH.
When a command fails, the start time of the EH is stored
in 'last_reset'. If the SCSI EH is still running after
last_reset + eh_deadline, the EH is short-circuited and
falls through to issue a host reset only.

Signed-off-by: Hannes Reinecke <hare@suse.de>
---
 drivers/scsi/hosts.c      |   7 +++
 drivers/scsi/scsi_error.c | 142 +++++++++++++++++++++++++++++++++++++++++++---
 drivers/scsi/scsi_sysfs.c |  37 ++++++++++++
 include/scsi/scsi_host.h  |   2 +-
 4 files changed, 180 insertions(+), 8 deletions(-)

diff --git a/drivers/scsi/hosts.c b/drivers/scsi/hosts.c
index df0c3c7..c8d828f 100644
--- a/drivers/scsi/hosts.c
+++ b/drivers/scsi/hosts.c
@@ -316,6 +316,12 @@ static void scsi_host_dev_release(struct device *dev)
 	kfree(shost);
 }
 
+/* Default EH deadline copied into each host by scsi_host_alloc(). */
+static unsigned int shost_eh_deadline;
+module_param_named(eh_deadline, shost_eh_deadline, uint, S_IRUGO|S_IWUSR);
+MODULE_PARM_DESC(eh_deadline,
+		 "SCSI EH deadline in seconds (should be between 1 and 2^32-1)");
+
 static struct device_type scsi_host_type = {
 	.name =		"scsi_host",
 	.release =	scsi_host_dev_release,
@@ -388,6 +394,7 @@ struct Scsi_Host *scsi_host_alloc(struct scsi_host_template *sht, int privsize)
 	shost->unchecked_isa_dma = sht->unchecked_isa_dma;
 	shost->use_clustering = sht->use_clustering;
 	shost->ordered_tag = sht->ordered_tag;
+	shost->eh_deadline = shost_eh_deadline;
 
 	if (sht->supported_mode == MODE_UNKNOWN)
 		/* means we didn't set it ... default to INITIATOR */
diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c
index 467cb3c..cf30475 100644
--- a/drivers/scsi/scsi_error.c
+++ b/drivers/scsi/scsi_error.c
@@ -91,6 +91,31 @@ void scsi_schedule_eh(struct Scsi_Host *shost)
 }
 EXPORT_SYMBOL_GPL(scsi_schedule_eh);
 
+/* Return 1 once eh_deadline seconds have elapsed since EH started, else 0. */
+static int sdev_eh_deadline(struct Scsi_Host *shost,
+			   unsigned long eh_start)
+{
+	if (!shost->eh_deadline)	/* zero means no deadline is configured */
+		return 0;
+
+	/* Measure from shost->last_reset when it is set and precedes eh_start. */
+	if (shost->last_reset != 0 &&
+	    time_before(shost->last_reset, eh_start))
+		eh_start = shost->last_reset;
+
+	/* eh_deadline is given in seconds; scale by HZ to compare with jiffies. */
+	eh_start += (unsigned long)shost->eh_deadline * HZ;
+	return !time_before(jiffies, eh_start);
+}
+
+/* Host-wide deadline check; reports 0 while no EH start time is recorded. */
+static int scsi_host_eh_deadline(struct Scsi_Host *shost)
+{
+	unsigned long started = shost->last_reset;
+
+	return started ? sdev_eh_deadline(shost, started) : 0;
+}
+
 /**
  * scsi_eh_abort_handler - Handle command aborts
  * @work:	sdev on which commands should be aborted.
@@ -102,13 +127,15 @@ scsi_eh_abort_handler(struct work_struct *work)
 		container_of(work, struct scsi_device, abort_work);
 	struct scsi_cmnd *scmd, *tmp;
 	LIST_HEAD(abort_list);
-	unsigned long flags;
+	unsigned long flags, eh_start;
 	int rtn;
 
 	spin_lock_irqsave(&sdev->list_lock, flags);
 	list_splice_init(&sdev->eh_abort_list, &abort_list);
 	spin_unlock_irqrestore(&sdev->list_lock, flags);
 
+	eh_start = jiffies;
+
 	list_for_each_entry_safe(scmd, tmp, &abort_list, eh_entry) {
 		list_del_init(&scmd->eh_entry);
 		if (sdev->sdev_state == SDEV_CANCEL) {
@@ -119,6 +146,13 @@ scsi_eh_abort_handler(struct work_struct *work)
 			scsi_finish_command(scmd);
 			continue;
 		}
+		if (sdev_eh_deadline(sdev->host, eh_start)) {
+			SCSI_LOG_ERROR_RECOVERY(3,
+				scmd_printk(KERN_INFO, scmd,
+					     "eh timeout, not aborting\n"));
+			list_move_tail(&scmd->eh_entry, &abort_list);
+			goto start_eh;
+		}
 		SCSI_LOG_ERROR_RECOVERY(3,
 			scmd_printk(KERN_INFO, scmd,
 				    "aborting command %p\n", scmd));
@@ -151,6 +185,12 @@ scsi_eh_abort_handler(struct work_struct *work)
 		return;
 
 start_eh:
+	spin_lock_irqsave(sdev->host->host_lock, flags);
+	if (sdev->host->eh_deadline &&
+	    (!sdev->host->last_reset ||
+	     time_before(eh_start, sdev->host->last_reset)))
+		sdev->host->last_reset = eh_start;
+	spin_unlock_irqrestore(sdev->host->host_lock, flags);
 	list_for_each_entry_safe(scmd, tmp, &abort_list, eh_entry) {
 		scmd->result |= DID_TIME_OUT << 16;
 		if (!scsi_eh_scmd_add(scmd, 0)) {
@@ -232,6 +272,9 @@ int scsi_eh_scmd_add(struct scsi_cmnd *scmd, int eh_flag)
 		if (scsi_host_set_state(shost, SHOST_CANCEL_RECOVERY))
 			goto out_unlock;
 
+	if (sdev->eh_deadline && !shost->last_reset)
+		shost->last_reset = jiffies;
+
 	ret = 1;
 	scmd->eh_eflags |= eh_flag;
 	list_add_tail(&scmd->eh_entry, &shost->eh_cmd_q);
@@ -1052,13 +1095,25 @@ int scsi_eh_get_sense(struct list_head *work_q,
 		      struct list_head *done_q)
 {
 	struct scsi_cmnd *scmd, *next;
+	struct Scsi_Host *shost;
 	int rtn;
+	unsigned long flags;
 
 	list_for_each_entry_safe(scmd, next, work_q, eh_entry) {
 		if ((scmd->eh_eflags & SCSI_EH_CANCEL_CMD) ||
 		    SCSI_SENSE_VALID(scmd))
 			continue;
 
+		shost = scmd->device->host;
+		spin_lock_irqsave(shost->host_lock, flags);
+		if (scsi_host_eh_deadline(shost)) {
+			spin_unlock_irqrestore(shost->host_lock, flags);
+			SCSI_LOG_ERROR_RECOVERY(3,
+				shost_printk(KERN_INFO, shost,
+					    "skip %s, eh timeout\n", __func__));
+			break;
+		}
+		spin_unlock_irqrestore(shost->host_lock, flags);
 		SCSI_LOG_ERROR_RECOVERY(2, scmd_printk(KERN_INFO, scmd,
 						  "%s: requesting sense\n",
 						  current->comm));
@@ -1143,11 +1198,22 @@ static int scsi_eh_test_devices(struct list_head *cmd_list,
 	struct scsi_cmnd *scmd, *next;
 	struct scsi_device *sdev;
 	int finish_cmds;
+	unsigned long flags;
 
 	while (!list_empty(cmd_list)) {
 		scmd = list_entry(cmd_list->next, struct scsi_cmnd, eh_entry);
 		sdev = scmd->device;
 
+		if (!try_stu) {
+			spin_lock_irqsave(sdev->host->host_lock, flags);
+			if (scsi_host_eh_deadline(sdev->host)) {
+				spin_unlock_irqrestore(sdev->host->host_lock,
+						       flags);
+				break;
+			}
+			spin_unlock_irqrestore(sdev->host->host_lock, flags);
+		}
+
 		finish_cmds = !scsi_device_online(scmd->device) ||
 			(try_stu && !scsi_eh_try_stu(scmd) &&
 			 !scsi_eh_tur(scmd)) ||
@@ -1183,14 +1249,26 @@ static int scsi_eh_abort_cmds(struct list_head *work_q,
 	struct scsi_cmnd *scmd, *next;
 	LIST_HEAD(check_list);
 	int rtn;
+	struct Scsi_Host *shost;
+	unsigned long flags;
 
 	list_for_each_entry_safe(scmd, next, work_q, eh_entry) {
 		if (!(scmd->eh_eflags & SCSI_EH_CANCEL_CMD))
 			continue;
+		shost = scmd->device->host;
+		spin_lock_irqsave(shost->host_lock, flags);
+		if (scsi_host_eh_deadline(shost)) {
+			spin_unlock_irqrestore(shost->host_lock, flags);
+			SCSI_LOG_ERROR_RECOVERY(3,
+				shost_printk(KERN_INFO, shost,
+					    "skip %s, eh timeout\n", __func__));
+			return 1;
+		}
+		spin_unlock_irqrestore(shost->host_lock, flags);
 		SCSI_LOG_ERROR_RECOVERY(3, printk("%s: aborting cmd:"
 						  "0x%p\n", current->comm,
 						  scmd));
-		rtn = scsi_try_to_abort_cmd(scmd->device->host->hostt, scmd);
+		rtn = scsi_try_to_abort_cmd(shost->hostt, scmd);
 		if (rtn == SUCCESS || rtn == FAST_IO_FAIL) {
 			scmd->eh_eflags &= ~SCSI_EH_CANCEL_CMD;
 			if (rtn == FAST_IO_FAIL)
@@ -1248,8 +1326,18 @@ static int scsi_eh_stu(struct Scsi_Host *shost,
 {
 	struct scsi_cmnd *scmd, *stu_scmd, *next;
 	struct scsi_device *sdev;
+	unsigned long flags;
 
 	shost_for_each_device(sdev, shost) {
+		spin_lock_irqsave(shost->host_lock, flags);
+		if (scsi_host_eh_deadline(shost)) {
+			spin_unlock_irqrestore(shost->host_lock, flags);
+			SCSI_LOG_ERROR_RECOVERY(3,
+				shost_printk(KERN_INFO, shost,
+					    "skip %s, eh timeout\n", __func__));
+			break;
+		}
+		spin_unlock_irqrestore(shost->host_lock, flags);
 		stu_scmd = NULL;
 		list_for_each_entry(scmd, work_q, eh_entry)
 			if (scmd->device == sdev && SCSI_SENSE_VALID(scmd) &&
@@ -1302,9 +1390,19 @@ static int scsi_eh_bus_device_reset(struct Scsi_Host *shost,
 {
 	struct scsi_cmnd *scmd, *bdr_scmd, *next;
 	struct scsi_device *sdev;
+	unsigned long flags;
 	int rtn;
 
 	shost_for_each_device(sdev, shost) {
+		spin_lock_irqsave(shost->host_lock, flags);
+		if (scsi_host_eh_deadline(shost)) {
+			spin_unlock_irqrestore(shost->host_lock, flags);
+			SCSI_LOG_ERROR_RECOVERY(3,
+				shost_printk(KERN_INFO, shost,
+					    "skip %s, eh timeout\n", __func__));
+			break;
+		}
+		spin_unlock_irqrestore(shost->host_lock, flags);
 		bdr_scmd = NULL;
 		list_for_each_entry(scmd, work_q, eh_entry)
 			if (scmd->device == sdev) {
@@ -1364,6 +1462,19 @@ static int scsi_eh_target_reset(struct Scsi_Host *shost,
 		struct scsi_cmnd *next, *scmd;
 		int rtn;
 		unsigned int id;
+		unsigned long flags;
+
+		spin_lock_irqsave(shost->host_lock, flags);
+		if (scsi_host_eh_deadline(shost)) {
+			spin_unlock_irqrestore(shost->host_lock, flags);
+			/* push back on work queue for further processing */
+			list_splice_init(&tmp_list, work_q);
+			SCSI_LOG_ERROR_RECOVERY(3,
+				shost_printk(KERN_INFO, shost,
+					    "skip %s, eh timeout\n", __func__));
+			return list_empty(work_q);
+		}
+		spin_unlock_irqrestore(shost->host_lock, flags);
 
 		scmd = list_entry(tmp_list.next, struct scsi_cmnd, eh_entry);
 		id = scmd_id(scmd);
@@ -1408,6 +1519,7 @@ static int scsi_eh_bus_reset(struct Scsi_Host *shost,
 	LIST_HEAD(check_list);
 	unsigned int channel;
 	int rtn;
+	unsigned long flags;
 
 	/*
 	 * we really want to loop over the various channels, and do this on
@@ -1417,6 +1529,16 @@ static int scsi_eh_bus_reset(struct Scsi_Host *shost,
 	 */
 
 	for (channel = 0; channel <= shost->max_channel; channel++) {
+		spin_lock_irqsave(shost->host_lock, flags);
+		if (scsi_host_eh_deadline(shost)) {
+			spin_unlock_irqrestore(shost->host_lock, flags);
+			SCSI_LOG_ERROR_RECOVERY(3,
+				shost_printk(KERN_INFO, shost,
+					    "skip %s, eh timeout\n", __func__));
+			return list_empty(work_q);
+		}
+		spin_unlock_irqrestore(shost->host_lock, flags);
+
 		chan_scmd = NULL;
 		list_for_each_entry(scmd, work_q, eh_entry) {
 			if (channel == scmd_channel(scmd)) {
@@ -1822,8 +1944,9 @@ static void scsi_restart_operations(struct Scsi_Host *shost)
 	 * will be requests for character device operations, and also for
 	 * ioctls to queued block devices.
 	 */
-	SCSI_LOG_ERROR_RECOVERY(3, printk("%s: waking up host to restart\n",
-					  __func__));
+	SCSI_LOG_ERROR_RECOVERY(3,
+		printk("scsi_eh_%d waking up host to restart\n",
+		       shost->host_no));
 
 	spin_lock_irqsave(shost->host_lock, flags);
 	if (scsi_host_set_state(shost, SHOST_RUNNING))
@@ -1950,6 +2073,10 @@ static void scsi_unjam_host(struct Scsi_Host *shost)
 		if (!scsi_eh_abort_cmds(&eh_work_q, &eh_done_q))
 			scsi_eh_ready_devs(shost, &eh_work_q, &eh_done_q);
 
+	spin_lock_irqsave(shost->host_lock, flags);
+	if (sdev->eh_deadline)
+		shost->last_reset = 0;
+	spin_unlock_irqrestore(shost->host_lock, flags);
 	scsi_eh_flush_done_q(&eh_done_q);
 }
 
@@ -1976,7 +2103,7 @@ int scsi_error_handler(void *data)
 		if ((shost->host_failed == 0 && shost->host_eh_scheduled == 0) ||
 		    shost->host_failed != shost->host_busy) {
 			SCSI_LOG_ERROR_RECOVERY(1,
-				printk("Error handler scsi_eh_%d sleeping\n",
+				printk("scsi_eh_%d: sleeping\n",
 					shost->host_no));
 			schedule();
 			continue;
@@ -1984,8 +2111,9 @@ int scsi_error_handler(void *data)
 
 		__set_current_state(TASK_RUNNING);
 		SCSI_LOG_ERROR_RECOVERY(1,
-			printk("Error handler scsi_eh_%d waking up\n",
-				shost->host_no));
+			printk("scsi_eh_%d: waking up %d/%d/%d\n",
+			       shost->host_no, shost->host_eh_scheduled,
+			       shost->host_failed, shost->host_busy));
 
 		/*
 		 * We have a host that is failing for some reason.  Figure out
diff --git a/drivers/scsi/scsi_sysfs.c b/drivers/scsi/scsi_sysfs.c
index af64c1c..3c1742f 100644
--- a/drivers/scsi/scsi_sysfs.c
+++ b/drivers/scsi/scsi_sysfs.c
@@ -281,6 +281,42 @@ exit_store_host_reset:
 
 static DEVICE_ATTR(host_reset, S_IWUSR, NULL, store_host_reset);
 
+/* sysfs read handler: emit the host's eh_deadline as a signed decimal line. */
+static ssize_t
+show_shost_eh_deadline(struct device *dev,
+		      struct device_attribute *attr, char *buf)
+{
+	/* class_to_shost() yields the Scsi_Host for this class device. */
+	return sprintf(buf, "%d\n", class_to_shost(dev)->eh_deadline);
+}
+
+/* sysfs write handler: set eh_deadline; -EBUSY while host is in recovery. */
+static ssize_t
+store_shost_eh_deadline(struct device *dev, struct device_attribute *attr,
+		const char *buf, size_t count)
+{
+	struct Scsi_Host *shost = class_to_shost(dev);
+	int ret = -EBUSY;
+	int timeout;
+	unsigned long flags;
+
+	if (shost->transportt->eh_strategy_handler)
+		return -EINVAL;
+	/* A negative deadline would lie in the past on every check; reject. */
+	if (sscanf(buf, "%d\n", &timeout) != 1 || timeout < 0)
+		return -EINVAL;
+
+	spin_lock_irqsave(shost->host_lock, flags);
+	if (!scsi_host_in_recovery(shost)) {
+		shost->eh_deadline = timeout;
+		ret = count;
+	}
+	spin_unlock_irqrestore(shost->host_lock, flags);
+	return ret;
+}
+
+static DEVICE_ATTR(eh_deadline, S_IRUGO | S_IWUSR, show_shost_eh_deadline, store_shost_eh_deadline);
+
 shost_rd_attr(unique_id, "%u\n");
 shost_rd_attr(host_busy, "%hu\n");
 shost_rd_attr(cmd_per_lun, "%hd\n");
@@ -308,6 +344,7 @@ static struct attribute *scsi_sysfs_shost_attrs[] = {
 	&dev_attr_prot_capabilities.attr,
 	&dev_attr_prot_guard_type.attr,
 	&dev_attr_host_reset.attr,
+	&dev_attr_eh_deadline.attr,
 	NULL
 };
 
diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h
index 7552435..ca87486 100644
--- a/include/scsi/scsi_host.h
+++ b/include/scsi/scsi_host.h
@@ -598,7 +598,7 @@ struct Scsi_Host {
 	unsigned int host_eh_scheduled;    /* EH scheduled without command */
     
 	unsigned int host_no;  /* Used for IOCTL_GET_IDLUN, /proc/scsi et al. */
-	int resetting; /* if set, it means that last_reset is a valid value */
+	int eh_deadline; /* Deadline for EH runtime */
 	unsigned long last_reset;
 
 	/*
-- 
1.7.12.4

next prev parent reply	other threads:[~2013-06-10 11:12 UTC|newest]

Thread overview: 21+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2013-06-10 11:11 [PATCH 0/7] Limit overall SCSI EH runtime Hannes Reinecke
2013-06-10 11:11 ` [PATCH 1/7] dpt_i2o: Remove DPTI_STATE_IOCTL Hannes Reinecke
2013-06-10 11:11 ` [PATCH 2/7] dpt_i2o: return SCSI_MLQUEUE_HOST_BUSY when in reset Hannes Reinecke
2013-06-10 11:11 ` [PATCH 3/7] advansys: Remove 'last_reset' references Hannes Reinecke
2013-06-10 11:11 ` [PATCH 4/7] tmscsim: Move 'last_reset' into host structure Hannes Reinecke
2013-06-10 11:11 ` [PATCH 5/7] dc395: Move 'last_reset' into internal " Hannes Reinecke
2013-06-10 11:11 ` [PATCH 6/7] scsi: remove check for 'resetting' Hannes Reinecke
2013-06-10 11:11 ` Hannes Reinecke [this message]
2013-06-27 14:33   ` [PATCH 7/7] scsi: Add 'eh_deadline' to limit SCSI EH runtime Ewan Milne
2013-06-28  7:14     ` Hannes Reinecke
2013-06-28 12:54       ` Ewan Milne
2013-06-28  7:29   ` Bart Van Assche
2013-06-28  7:42     ` Hannes Reinecke
2013-06-27  9:23 ` [PATCH 0/7] Limit overall " Ren Mingxin
  -- strict thread matches above, loose matches on Subject: below --
2013-07-01  6:50 [PATCHv2 " Hannes Reinecke
2013-07-01  6:50 ` [PATCH 7/7] scsi: Add 'eh_deadline' to limit " Hannes Reinecke
2013-09-20  7:48   ` Ren Mingxin
2013-10-16 19:22   ` James Bottomley
2013-10-17 14:27     ` Ewan Milne
2013-10-23  9:25     ` Hannes Reinecke
2013-10-23  7:46       ` James Bottomley
2013-10-23  9:49         ` Hannes Reinecke

find likely ancestor, descendant, or conflicting patches for this message:
( dfblob:df0c3c7 dfblob:c8d828f dfblob:467cb3c dfblob:cf30475
dfblob:af64c1c dfblob:3c1742f dfblob:7552435 dfblob:ca87486 )
 OR (
bs:"[PATCH 7/7] scsi: Add 'eh_deadline' to limit SCSI EH runtime" )
	(help)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1370862713-41323-8-git-send-email-hare@suse.de \
    --to=hare@suse.de \
    --cc=bmr@redhat.com \
    --cc=emilne@redhat.com \
    --cc=hch@infradead.org \
    --cc=james.smart@emulex.com \
    --cc=jbottomley@parallels.com \
    --cc=joern@logfs.org \
    --cc=linux-scsi@vger.kernel.org \
    --cc=renmx@cn.fujitsu.com \
    --cc=roland@purestorage.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).