linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [REVIEW PATCH] scsi: Add 'retry_timeout' to avoid infinite command retry
       [not found] <1391579254-26204-1-git-send-email-eiichi.tsukata.xh@hitachi.com>
@ 2014-02-05  5:47 ` Eiichi Tsukata
  2014-02-05 16:55   ` James Bottomley
  0 siblings, 1 reply; 7+ messages in thread
From: Eiichi Tsukata @ 2014-02-05  5:47 UTC (permalink / raw)
  To: eiichi.tsukata.xh; +Cc: James E.J. Bottomley, linux-scsi, linux-kernel

Currently, scsi error handling in scsi_decide_disposition() tries to
unconditionally requeue scsi command when device keeps some error state.
This is because retryable errors are thought to be temporary and the scsi
device will soon recover from those errors. Normally, such retry policy is
appropriate because the device will soon recover from temporary error state.

But there is no guarantee that device is able to recover from error state
immediately. Some hardware error may prevent device from recovering.
Therefore hardware error can results in infinite command retry loop.

This patchs adds 'retry_timeout' sysfs attribute which limits the retry time
of each scsi command. Once scsi command retry time is longer than this timeout,
the command is treated as failure. 'retry_timeout' is set to '0' by default
which means no timeout set.

Signed-off-by: Eiichi Tsukata <eiichi.tsukata.xh@hitachi.com>
Cc: "James E.J. Bottomley" <JBottomley@parallels.com>
Cc: linux-scsi@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
---
 drivers/scsi/scsi_lib.c    | 22 +++++++++++++++++++++-
 drivers/scsi/scsi_scan.c   |  1 +
 drivers/scsi/scsi_sysfs.c  | 32 ++++++++++++++++++++++++++++++++
 include/scsi/scsi.h        |  1 +
 include/scsi/scsi_device.h |  1 +
 5 files changed, 56 insertions(+), 1 deletion(-)

diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 7bd7f0d..a9db69e 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -1492,6 +1492,23 @@ static void scsi_kill_request(struct request *req, struct request_queue *q)
 	blk_complete_request(req);
 }
 
+/*
+ * Check if scsi command excessed retry timeout
+ */
+static int scsi_retry_timed_out(struct scsi_cmnd *cmd)
+{
+	unsigned int retry_timeout;
+
+	retry_timeout = cmd->device->retry_timeout;
+	if (retry_timeout &&
+	    time_before(cmd->jiffies_at_alloc + retry_timeout, jiffies)) {
+		scmd_printk(KERN_INFO, cmd, "retry timeout\n");
+		return 1;
+	}
+
+	return 0;
+}
+
 static void scsi_softirq_done(struct request *rq)
 {
 	struct scsi_cmnd *cmd = rq->special;
@@ -1512,7 +1529,10 @@ static void scsi_softirq_done(struct request *rq)
 			    wait_for/HZ);
 		disposition = SUCCESS;
 	}
-			
+
+	if (scsi_retry_timed_out(cmd))
+		disposition = FAILED;
+
 	scsi_log_completion(cmd, disposition);
 
 	switch (disposition) {
diff --git a/drivers/scsi/scsi_scan.c b/drivers/scsi/scsi_scan.c
index 307a811..4ab044a 100644
--- a/drivers/scsi/scsi_scan.c
+++ b/drivers/scsi/scsi_scan.c
@@ -925,6 +925,7 @@ static int scsi_add_lun(struct scsi_device *sdev, unsigned char *inq_result,
 		sdev->no_dif = 1;
 
 	sdev->eh_timeout = SCSI_DEFAULT_EH_TIMEOUT;
+	sdev->retry_timeout = SCSI_DEFAULT_RETRY_TIMEOUT;
 
 	if (*bflags & BLIST_SKIP_VPD_PAGES)
 		sdev->skip_vpd_pages = 1;
diff --git a/drivers/scsi/scsi_sysfs.c b/drivers/scsi/scsi_sysfs.c
index 8ff62c2..eaa2118 100644
--- a/drivers/scsi/scsi_sysfs.c
+++ b/drivers/scsi/scsi_sysfs.c
@@ -627,6 +627,37 @@ sdev_store_eh_timeout(struct device *dev, struct device_attribute *attr,
 static DEVICE_ATTR(eh_timeout, S_IRUGO | S_IWUSR, sdev_show_eh_timeout, sdev_store_eh_timeout);
 
 static ssize_t
+sdev_show_retry_timeout(struct device *dev,
+			struct device_attribute *attr, char *buf)
+{
+	struct scsi_device *sdev;
+	sdev = to_scsi_device(dev);
+	return snprintf(buf, 20, "%u\n", sdev->retry_timeout / HZ);
+}
+
+static ssize_t
+sdev_store_retry_timeout(struct device *dev, struct device_attribute *attr,
+			 const char *buf, size_t count)
+{
+	struct scsi_device *sdev;
+	unsigned int retry_timeout;
+	int err;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EACCES;
+
+	sdev = to_scsi_device(dev);
+	err = kstrtouint(buf, 10, &retry_timeout);
+	if (err)
+		return err;
+	sdev->retry_timeout = retry_timeout * HZ;
+
+	return count;
+}
+static DEVICE_ATTR(retry_timeout, S_IRUGO | S_IWUSR,
+		   sdev_show_retry_timeout, sdev_store_retry_timeout);
+
+static ssize_t
 store_rescan_field (struct device *dev, struct device_attribute *attr,
 		    const char *buf, size_t count)
 {
@@ -797,6 +828,7 @@ static struct attribute *scsi_sdev_attrs[] = {
 	&dev_attr_state.attr,
 	&dev_attr_timeout.attr,
 	&dev_attr_eh_timeout.attr,
+	&dev_attr_retry_timeout.attr,
 	&dev_attr_iocounterbits.attr,
 	&dev_attr_iorequest_cnt.attr,
 	&dev_attr_iodone_cnt.attr,
diff --git a/include/scsi/scsi.h b/include/scsi/scsi.h
index 66d42ed..545408d 100644
--- a/include/scsi/scsi.h
+++ b/include/scsi/scsi.h
@@ -16,6 +16,7 @@ struct scsi_cmnd;
 
 enum scsi_timeouts {
 	SCSI_DEFAULT_EH_TIMEOUT		= 10 * HZ,
+	SCSI_DEFAULT_RETRY_TIMEOUT	= 0,	/* disabled by default */
 };
 
 /*
diff --git a/include/scsi/scsi_device.h b/include/scsi/scsi_device.h
index d65fbec..04fc5ee 100644
--- a/include/scsi/scsi_device.h
+++ b/include/scsi/scsi_device.h
@@ -121,6 +121,7 @@ struct scsi_device {
 				 * pass settings from slave_alloc to scsi
 				 * core. */
 	unsigned int eh_timeout; /* Error handling timeout */
+	unsigned int retry_timeout;	/* Command retry timeout */
 	unsigned writeable:1;
 	unsigned removable:1;
 	unsigned changed:1;	/* Data invalid due to media change */
-- 
1.8.3.1


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* Re: [REVIEW PATCH] scsi: Add 'retry_timeout' to avoid infinite command retry
  2014-02-05  5:47 ` [REVIEW PATCH] scsi: Add 'retry_timeout' to avoid infinite command retry Eiichi Tsukata
@ 2014-02-05 16:55   ` James Bottomley
  2014-02-06  4:11     ` Eiichi Tsukata
  0 siblings, 1 reply; 7+ messages in thread
From: James Bottomley @ 2014-02-05 16:55 UTC (permalink / raw)
  To: eiichi.tsukata.xh@hitachi.com
  Cc: linux-scsi@vger.kernel.org, linux-kernel@vger.kernel.org

On Wed, 2014-02-05 at 14:47 +0900, Eiichi Tsukata wrote:
> Currently, scsi error handling in scsi_decide_disposition() tries to
> unconditionally requeue scsi command when device keeps some error state.
> This is because retryable errors are thought to be temporary and the scsi
> device will soon recover from those errors. Normally, such retry policy is
> appropriate because the device will soon recover from temporary error state.

This description isn't very descriptive.  I presume you're talking about
the ADD_TO_MLQUEUE return from scsi_decide_disposition()?

> But there is no guarantee that device is able to recover from error state
> immediately. Some hardware error may prevent device from recovering.
> Therefore hardware error can results in infinite command retry loop.

If you're talking about ADD_TO_MLQUEUE, I don't believe this is correct:
there's a test in scsi_softirq_done() that makes sure the maximum
lifetime is retries*timeout and fails the command after that.

James


> This patchs adds 'retry_timeout' sysfs attribute which limits the retry time
> of each scsi command. Once scsi command retry time is longer than this timeout,
> the command is treated as failure. 'retry_timeout' is set to '0' by default
> which means no timeout set.
> 
> Signed-off-by: Eiichi Tsukata <eiichi.tsukata.xh@hitachi.com>
> Cc: "James E.J. Bottomley" <JBottomley@parallels.com>
> Cc: linux-scsi@vger.kernel.org
> Cc: linux-kernel@vger.kernel.org
> ---
>  drivers/scsi/scsi_lib.c    | 22 +++++++++++++++++++++-
>  drivers/scsi/scsi_scan.c   |  1 +
>  drivers/scsi/scsi_sysfs.c  | 32 ++++++++++++++++++++++++++++++++
>  include/scsi/scsi.h        |  1 +
>  include/scsi/scsi_device.h |  1 +
>  5 files changed, 56 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
> index 7bd7f0d..a9db69e 100644
> --- a/drivers/scsi/scsi_lib.c
> +++ b/drivers/scsi/scsi_lib.c
> @@ -1492,6 +1492,23 @@ static void scsi_kill_request(struct request *req, struct request_queue *q)
>  	blk_complete_request(req);
>  }
>  
> +/*
> + * Check if scsi command excessed retry timeout
> + */
> +static int scsi_retry_timed_out(struct scsi_cmnd *cmd)
> +{
> +	unsigned int retry_timeout;
> +
> +	retry_timeout = cmd->device->retry_timeout;
> +	if (retry_timeout &&
> +	    time_before(cmd->jiffies_at_alloc + retry_timeout, jiffies)) {
> +		scmd_printk(KERN_INFO, cmd, "retry timeout\n");
> +		return 1;
> +	}
> +
> +	return 0;
> +}
> +
>  static void scsi_softirq_done(struct request *rq)
>  {
>  	struct scsi_cmnd *cmd = rq->special;
> @@ -1512,7 +1529,10 @@ static void scsi_softirq_done(struct request *rq)
>  			    wait_for/HZ);
>  		disposition = SUCCESS;
>  	}
> -			
> +
> +	if (scsi_retry_timed_out(cmd))
> +		disposition = FAILED;
> +
>  	scsi_log_completion(cmd, disposition);
>  
>  	switch (disposition) {
> diff --git a/drivers/scsi/scsi_scan.c b/drivers/scsi/scsi_scan.c
> index 307a811..4ab044a 100644
> --- a/drivers/scsi/scsi_scan.c
> +++ b/drivers/scsi/scsi_scan.c
> @@ -925,6 +925,7 @@ static int scsi_add_lun(struct scsi_device *sdev, unsigned char *inq_result,
>  		sdev->no_dif = 1;
>  
>  	sdev->eh_timeout = SCSI_DEFAULT_EH_TIMEOUT;
> +	sdev->retry_timeout = SCSI_DEFAULT_RETRY_TIMEOUT;
>  
>  	if (*bflags & BLIST_SKIP_VPD_PAGES)
>  		sdev->skip_vpd_pages = 1;
> diff --git a/drivers/scsi/scsi_sysfs.c b/drivers/scsi/scsi_sysfs.c
> index 8ff62c2..eaa2118 100644
> --- a/drivers/scsi/scsi_sysfs.c
> +++ b/drivers/scsi/scsi_sysfs.c
> @@ -627,6 +627,37 @@ sdev_store_eh_timeout(struct device *dev, struct device_attribute *attr,
>  static DEVICE_ATTR(eh_timeout, S_IRUGO | S_IWUSR, sdev_show_eh_timeout, sdev_store_eh_timeout);
>  
>  static ssize_t
> +sdev_show_retry_timeout(struct device *dev,
> +			struct device_attribute *attr, char *buf)
> +{
> +	struct scsi_device *sdev;
> +	sdev = to_scsi_device(dev);
> +	return snprintf(buf, 20, "%u\n", sdev->retry_timeout / HZ);
> +}
> +
> +static ssize_t
> +sdev_store_retry_timeout(struct device *dev, struct device_attribute *attr,
> +			 const char *buf, size_t count)
> +{
> +	struct scsi_device *sdev;
> +	unsigned int retry_timeout;
> +	int err;
> +
> +	if (!capable(CAP_SYS_ADMIN))
> +		return -EACCES;
> +
> +	sdev = to_scsi_device(dev);
> +	err = kstrtouint(buf, 10, &retry_timeout);
> +	if (err)
> +		return err;
> +	sdev->retry_timeout = retry_timeout * HZ;
> +
> +	return count;
> +}
> +static DEVICE_ATTR(retry_timeout, S_IRUGO | S_IWUSR,
> +		   sdev_show_retry_timeout, sdev_store_retry_timeout);
> +
> +static ssize_t
>  store_rescan_field (struct device *dev, struct device_attribute *attr,
>  		    const char *buf, size_t count)
>  {
> @@ -797,6 +828,7 @@ static struct attribute *scsi_sdev_attrs[] = {
>  	&dev_attr_state.attr,
>  	&dev_attr_timeout.attr,
>  	&dev_attr_eh_timeout.attr,
> +	&dev_attr_retry_timeout.attr,
>  	&dev_attr_iocounterbits.attr,
>  	&dev_attr_iorequest_cnt.attr,
>  	&dev_attr_iodone_cnt.attr,
> diff --git a/include/scsi/scsi.h b/include/scsi/scsi.h
> index 66d42ed..545408d 100644
> --- a/include/scsi/scsi.h
> +++ b/include/scsi/scsi.h
> @@ -16,6 +16,7 @@ struct scsi_cmnd;
>  
>  enum scsi_timeouts {
>  	SCSI_DEFAULT_EH_TIMEOUT		= 10 * HZ,
> +	SCSI_DEFAULT_RETRY_TIMEOUT	= 0,	/* disabled by default */
>  };
>  
>  /*
> diff --git a/include/scsi/scsi_device.h b/include/scsi/scsi_device.h
> index d65fbec..04fc5ee 100644
> --- a/include/scsi/scsi_device.h
> +++ b/include/scsi/scsi_device.h
> @@ -121,6 +121,7 @@ struct scsi_device {
>  				 * pass settings from slave_alloc to scsi
>  				 * core. */
>  	unsigned int eh_timeout; /* Error handling timeout */
> +	unsigned int retry_timeout;	/* Command retry timeout */
>  	unsigned writeable:1;
>  	unsigned removable:1;
>  	unsigned changed:1;	/* Data invalid due to media change */



^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [REVIEW PATCH] scsi: Add 'retry_timeout' to avoid infinite command retry
  2014-02-05 16:55   ` James Bottomley
@ 2014-02-06  4:11     ` Eiichi Tsukata
  2014-02-07  0:22       ` [PATCH v2] " Eiichi Tsukata
  0 siblings, 1 reply; 7+ messages in thread
From: Eiichi Tsukata @ 2014-02-06  4:11 UTC (permalink / raw)
  To: James Bottomley
  Cc: linux-scsi@vger.kernel.org, linux-kernel@vger.kernel.org,
	yrl.pp-manager.tt

(2014/02/06 1:55), James Bottomley wrote:
> On Wed, 2014-02-05 at 14:47 +0900, Eiichi Tsukata wrote:
>> Currently, scsi error handling in scsi_decide_disposition() tries to
>> unconditionally requeue scsi command when device keeps some error state.
>> This is because retryable errors are thought to be temporary and the scsi
>> device will soon recover from those errors. Normally, such retry policy is
>> appropriate because the device will soon recover from temporary error state.
> This description isn't very descriptive.  I presume you're talking about
> the ADD_TO_MLQUEUE return from scsi_decide_disposition()?
>

Thanks for reviewing, James.

I was talking about ADD_TO_MLQUEUE and NEEDS_RETRY.
I'll fix the description.

>> But there is no guarantee that device is able to recover from error state
>> immediately. Some hardware error may prevent device from recovering.
>> Therefore hardware error can results in infinite command retry loop.
> If you're talking about ADD_TO_MLQUEUE, I don't believe this is correct:
> there's a test in scsi_softirq_done() that makes sure the maximum
> lifetime is retries*timeout and fails the command after that.

I see, threre's already timeout in scsi_softirq_done() so my patch is not correct
as you say.

What I really want to do is to prevent command from retrying indefinitely
in scsi_io_completion()
with action == ACTION_RETRY || action == ACTION_DELAYED_RETRY.
In v2 patch, I'll change the location of retry timeout check from scsi_softirq_done()
to scsi_io_completion().

Eiichi


(2014/02/06 1:55), James Bottomley wrote:
> On Wed, 2014-02-05 at 14:47 +0900, Eiichi Tsukata wrote:
>> Currently, scsi error handling in scsi_decide_disposition() tries to
>> unconditionally requeue scsi command when device keeps some error state.
>> This is because retryable errors are thought to be temporary and the scsi
>> device will soon recover from those errors. Normally, such retry policy is
>> appropriate because the device will soon recover from temporary error state.
> This description isn't very descriptive.  I presume you're talking about
> the ADD_TO_MLQUEUE return from scsi_decide_disposition()?
>
>> But there is no guarantee that device is able to recover from error state
>> immediately. Some hardware error may prevent device from recovering.
>> Therefore hardware error can results in infinite command retry loop.
> If you're talking about ADD_TO_MLQUEUE, I don't believe this is correct:
> there's a test in scsi_softirq_done() that makes sure the maximum
> lifetime is retries*timeout and fails the command after that.
>
> James
>
>
>> This patchs adds 'retry_timeout' sysfs attribute which limits the retry time
>> of each scsi command. Once scsi command retry time is longer than this timeout,
>> the command is treated as failure. 'retry_timeout' is set to '0' by default
>> which means no timeout set.
>>
>> Signed-off-by: Eiichi Tsukata <eiichi.tsukata.xh@hitachi.com>
>> Cc: "James E.J. Bottomley" <JBottomley@parallels.com>
>> Cc: linux-scsi@vger.kernel.org
>> Cc: linux-kernel@vger.kernel.org
>> ---
>>   drivers/scsi/scsi_lib.c    | 22 +++++++++++++++++++++-
>>   drivers/scsi/scsi_scan.c   |  1 +
>>   drivers/scsi/scsi_sysfs.c  | 32 ++++++++++++++++++++++++++++++++
>>   include/scsi/scsi.h        |  1 +
>>   include/scsi/scsi_device.h |  1 +
>>   5 files changed, 56 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
>> index 7bd7f0d..a9db69e 100644
>> --- a/drivers/scsi/scsi_lib.c
>> +++ b/drivers/scsi/scsi_lib.c
>> @@ -1492,6 +1492,23 @@ static void scsi_kill_request(struct request *req, struct request_queue *q)
>>   	blk_complete_request(req);
>>   }
>>   
>> +/*
>> + * Check if scsi command excessed retry timeout
>> + */
>> +static int scsi_retry_timed_out(struct scsi_cmnd *cmd)
>> +{
>> +	unsigned int retry_timeout;
>> +
>> +	retry_timeout = cmd->device->retry_timeout;
>> +	if (retry_timeout &&
>> +	    time_before(cmd->jiffies_at_alloc + retry_timeout, jiffies)) {
>> +		scmd_printk(KERN_INFO, cmd, "retry timeout\n");
>> +		return 1;
>> +	}
>> +
>> +	return 0;
>> +}
>> +
>>   static void scsi_softirq_done(struct request *rq)
>>   {
>>   	struct scsi_cmnd *cmd = rq->special;
>> @@ -1512,7 +1529,10 @@ static void scsi_softirq_done(struct request *rq)
>>   			    wait_for/HZ);
>>   		disposition = SUCCESS;
>>   	}
>> -			
>> +
>> +	if (scsi_retry_timed_out(cmd))
>> +		disposition = FAILED;
>> +
>>   	scsi_log_completion(cmd, disposition);
>>   
>>   	switch (disposition) {
>> diff --git a/drivers/scsi/scsi_scan.c b/drivers/scsi/scsi_scan.c
>> index 307a811..4ab044a 100644
>> --- a/drivers/scsi/scsi_scan.c
>> +++ b/drivers/scsi/scsi_scan.c
>> @@ -925,6 +925,7 @@ static int scsi_add_lun(struct scsi_device *sdev, unsigned char *inq_result,
>>   		sdev->no_dif = 1;
>>   
>>   	sdev->eh_timeout = SCSI_DEFAULT_EH_TIMEOUT;
>> +	sdev->retry_timeout = SCSI_DEFAULT_RETRY_TIMEOUT;
>>   
>>   	if (*bflags & BLIST_SKIP_VPD_PAGES)
>>   		sdev->skip_vpd_pages = 1;
>> diff --git a/drivers/scsi/scsi_sysfs.c b/drivers/scsi/scsi_sysfs.c
>> index 8ff62c2..eaa2118 100644
>> --- a/drivers/scsi/scsi_sysfs.c
>> +++ b/drivers/scsi/scsi_sysfs.c
>> @@ -627,6 +627,37 @@ sdev_store_eh_timeout(struct device *dev, struct device_attribute *attr,
>>   static DEVICE_ATTR(eh_timeout, S_IRUGO | S_IWUSR, sdev_show_eh_timeout, sdev_store_eh_timeout);
>>   
>>   static ssize_t
>> +sdev_show_retry_timeout(struct device *dev,
>> +			struct device_attribute *attr, char *buf)
>> +{
>> +	struct scsi_device *sdev;
>> +	sdev = to_scsi_device(dev);
>> +	return snprintf(buf, 20, "%u\n", sdev->retry_timeout / HZ);
>> +}
>> +
>> +static ssize_t
>> +sdev_store_retry_timeout(struct device *dev, struct device_attribute *attr,
>> +			 const char *buf, size_t count)
>> +{
>> +	struct scsi_device *sdev;
>> +	unsigned int retry_timeout;
>> +	int err;
>> +
>> +	if (!capable(CAP_SYS_ADMIN))
>> +		return -EACCES;
>> +
>> +	sdev = to_scsi_device(dev);
>> +	err = kstrtouint(buf, 10, &retry_timeout);
>> +	if (err)
>> +		return err;
>> +	sdev->retry_timeout = retry_timeout * HZ;
>> +
>> +	return count;
>> +}
>> +static DEVICE_ATTR(retry_timeout, S_IRUGO | S_IWUSR,
>> +		   sdev_show_retry_timeout, sdev_store_retry_timeout);
>> +
>> +static ssize_t
>>   store_rescan_field (struct device *dev, struct device_attribute *attr,
>>   		    const char *buf, size_t count)
>>   {
>> @@ -797,6 +828,7 @@ static struct attribute *scsi_sdev_attrs[] = {
>>   	&dev_attr_state.attr,
>>   	&dev_attr_timeout.attr,
>>   	&dev_attr_eh_timeout.attr,
>> +	&dev_attr_retry_timeout.attr,
>>   	&dev_attr_iocounterbits.attr,
>>   	&dev_attr_iorequest_cnt.attr,
>>   	&dev_attr_iodone_cnt.attr,
>> diff --git a/include/scsi/scsi.h b/include/scsi/scsi.h
>> index 66d42ed..545408d 100644
>> --- a/include/scsi/scsi.h
>> +++ b/include/scsi/scsi.h
>> @@ -16,6 +16,7 @@ struct scsi_cmnd;
>>   
>>   enum scsi_timeouts {
>>   	SCSI_DEFAULT_EH_TIMEOUT		= 10 * HZ,
>> +	SCSI_DEFAULT_RETRY_TIMEOUT	= 0,	/* disabled by default */
>>   };
>>   
>>   /*
>> diff --git a/include/scsi/scsi_device.h b/include/scsi/scsi_device.h
>> index d65fbec..04fc5ee 100644
>> --- a/include/scsi/scsi_device.h
>> +++ b/include/scsi/scsi_device.h
>> @@ -121,6 +121,7 @@ struct scsi_device {
>>   				 * pass settings from slave_alloc to scsi
>>   				 * core. */
>>   	unsigned int eh_timeout; /* Error handling timeout */
>> +	unsigned int retry_timeout;	/* Command retry timeout */
>>   	unsigned writeable:1;
>>   	unsigned removable:1;
>>   	unsigned changed:1;	/* Data invalid due to media change */
>
>
> .
>


^ permalink raw reply	[flat|nested] 7+ messages in thread

* [PATCH v2] scsi: Add 'retry_timeout' to avoid infinite command retry
  2014-02-06  4:11     ` Eiichi Tsukata
@ 2014-02-07  0:22       ` Eiichi Tsukata
  2014-02-07  5:46         ` James Bottomley
  0 siblings, 1 reply; 7+ messages in thread
From: Eiichi Tsukata @ 2014-02-07  0:22 UTC (permalink / raw)
  To: JBottomley, linux-scsi; +Cc: linux-kernel, yrl.pp-manager.tt

Currently, scsi error handling in scsi_io_completion() tries to
unconditionally requeue scsi command when device keeps some error state.
For example, UNIT_ATTENTION causes infinite retry with
action == ACTION_RETRY.
This is because retryable errors are thought to be temporary and the scsi
device will soon recover from those errors. Normally, such retry policy is
appropriate because the device will soon recover from temporary error state.

But there is no guarantee that device is able to recover from error state
immediately. Actually, we've experienced an infinite retry on some hardware.
Therefore hardware error can results in infinite command retry loop.

This patch adds 'retry_timeout' sysfs attribute which limits the retry time
of each scsi command. This attribute is located in scsi sysfs directory
for example "/sys/bus/scsi/devices/X:X:X:X/" and value is in seconds.
Once scsi command retry time is longer than this timeout,
the command is treated as failure. 'retry_timeout' is set to '0' by default
which means no timeout set.

Usage:
 - To set retry timeout(set retry_timeout to 30 sec):
	# echo 30 > /sys/bus/scsi/devices/X:X:X:X/retry_timeout

Changes in v2:
 - check retry timeout in scsi_io_completion() instead of scsi_softirq_done()

Signed-off-by: Eiichi Tsukata <eiichi.tsukata.xh@hitachi.com>
Cc: "James E.J. Bottomley" <JBottomley@parallels.com>
Cc: linux-scsi@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
---
 drivers/scsi/scsi_lib.c    |   22 ++++++++++++++++++++++
 drivers/scsi/scsi_scan.c   |    1 +
 drivers/scsi/scsi_sysfs.c  |   32 ++++++++++++++++++++++++++++++++
 include/scsi/scsi.h        |    1 +
 include/scsi/scsi_device.h |    1 +
 5 files changed, 57 insertions(+)

diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 7bd7f0d..813b287 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -741,6 +741,24 @@ static int __scsi_error_from_host_byte(struct scsi_cmnd *cmd, int result)
 }
 
 /*
+ * Check if scsi command excessed retry timeout
+ */
+static int scsi_retry_timed_out(struct scsi_cmnd *cmd)
+{
+	unsigned int retry_timeout;
+
+	retry_timeout = cmd->device->retry_timeout;
+	if (retry_timeout &&
+	    time_before(cmd->jiffies_at_alloc + retry_timeout, jiffies)) {
+		scmd_printk(KERN_ERR, cmd, "retry timeout, waited %us\n",
+			    retry_timeout/HZ);
+		return 1;
+	}
+
+	return 0;
+}
+
+/*
  * Function:    scsi_io_completion()
  *
  * Purpose:     Completion processing for block device I/O requests.
@@ -989,6 +1007,10 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes)
 		action = ACTION_FAIL;
 	}
 
+	if ((action == ACTION_RETRY || action == ACTION_DELAYED_RETRY) &&
+	    scsi_retry_timed_out(cmd))
+		action = ACTION_FAIL;
+
 	switch (action) {
 	case ACTION_FAIL:
 		/* Give up and fail the remainder of the request */
diff --git a/drivers/scsi/scsi_scan.c b/drivers/scsi/scsi_scan.c
index 307a811..4ab044a 100644
--- a/drivers/scsi/scsi_scan.c
+++ b/drivers/scsi/scsi_scan.c
@@ -925,6 +925,7 @@ static int scsi_add_lun(struct scsi_device *sdev, unsigned char *inq_result,
 		sdev->no_dif = 1;
 
 	sdev->eh_timeout = SCSI_DEFAULT_EH_TIMEOUT;
+	sdev->retry_timeout = SCSI_DEFAULT_RETRY_TIMEOUT;
 
 	if (*bflags & BLIST_SKIP_VPD_PAGES)
 		sdev->skip_vpd_pages = 1;
diff --git a/drivers/scsi/scsi_sysfs.c b/drivers/scsi/scsi_sysfs.c
index 8ff62c2..eaa2118 100644
--- a/drivers/scsi/scsi_sysfs.c
+++ b/drivers/scsi/scsi_sysfs.c
@@ -627,6 +627,37 @@ sdev_store_eh_timeout(struct device *dev, struct device_attribute *attr,
 static DEVICE_ATTR(eh_timeout, S_IRUGO | S_IWUSR, sdev_show_eh_timeout, sdev_store_eh_timeout);
 
 static ssize_t
+sdev_show_retry_timeout(struct device *dev,
+			struct device_attribute *attr, char *buf)
+{
+	struct scsi_device *sdev;
+	sdev = to_scsi_device(dev);
+	return snprintf(buf, 20, "%u\n", sdev->retry_timeout / HZ);
+}
+
+static ssize_t
+sdev_store_retry_timeout(struct device *dev, struct device_attribute *attr,
+			 const char *buf, size_t count)
+{
+	struct scsi_device *sdev;
+	unsigned int retry_timeout;
+	int err;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EACCES;
+
+	sdev = to_scsi_device(dev);
+	err = kstrtouint(buf, 10, &retry_timeout);
+	if (err)
+		return err;
+	sdev->retry_timeout = retry_timeout * HZ;
+
+	return count;
+}
+static DEVICE_ATTR(retry_timeout, S_IRUGO | S_IWUSR,
+		   sdev_show_retry_timeout, sdev_store_retry_timeout);
+
+static ssize_t
 store_rescan_field (struct device *dev, struct device_attribute *attr,
 		    const char *buf, size_t count)
 {
@@ -797,6 +828,7 @@ static struct attribute *scsi_sdev_attrs[] = {
 	&dev_attr_state.attr,
 	&dev_attr_timeout.attr,
 	&dev_attr_eh_timeout.attr,
+	&dev_attr_retry_timeout.attr,
 	&dev_attr_iocounterbits.attr,
 	&dev_attr_iorequest_cnt.attr,
 	&dev_attr_iodone_cnt.attr,
diff --git a/include/scsi/scsi.h b/include/scsi/scsi.h
index 66d42ed..545408d 100644
--- a/include/scsi/scsi.h
+++ b/include/scsi/scsi.h
@@ -16,6 +16,7 @@ struct scsi_cmnd;
 
 enum scsi_timeouts {
 	SCSI_DEFAULT_EH_TIMEOUT		= 10 * HZ,
+	SCSI_DEFAULT_RETRY_TIMEOUT	= 0,	/* disabled by default */
 };
 
 /*
diff --git a/include/scsi/scsi_device.h b/include/scsi/scsi_device.h
index d65fbec..04fc5ee 100644
--- a/include/scsi/scsi_device.h
+++ b/include/scsi/scsi_device.h
@@ -121,6 +121,7 @@ struct scsi_device {
 				 * pass settings from slave_alloc to scsi
 				 * core. */
 	unsigned int eh_timeout; /* Error handling timeout */
+	unsigned int retry_timeout;	/* Command retry timeout */
 	unsigned writeable:1;
 	unsigned removable:1;
 	unsigned changed:1;	/* Data invalid due to media change */


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* Re: [PATCH v2] scsi: Add 'retry_timeout' to avoid infinite command retry
  2014-02-07  0:22       ` [PATCH v2] " Eiichi Tsukata
@ 2014-02-07  5:46         ` James Bottomley
  2014-02-07  6:15           ` Libo Chen
  0 siblings, 1 reply; 7+ messages in thread
From: James Bottomley @ 2014-02-07  5:46 UTC (permalink / raw)
  To: Eiichi Tsukata; +Cc: linux-scsi, linux-kernel, yrl.pp-manager.tt

On Fri, 2014-02-07 at 09:22 +0900, Eiichi Tsukata wrote:
> Currently, scsi error handling in scsi_io_completion() tries to
> unconditionally requeue scsi command when device keeps some error state.
> For example, UNIT_ATTENTION causes infinite retry with
> action == ACTION_RETRY.
> This is because retryable errors are thought to be temporary and the scsi
> device will soon recover from those errors. Normally, such retry policy is
> appropriate because the device will soon recover from temporary error state.



> But there is no guarantee that device is able to recover from error state
> immediately. Actually, we've experienced an infinite retry on some hardware.
> Therefore hardware error can results in infinite command retry loop.

Could you please add an analysis of the actual failure; which devices
and what conditions.

> This patch adds 'retry_timeout' sysfs attribute which limits the retry time
> of each scsi command. This attribute is located in scsi sysfs directory
> for example "/sys/bus/scsi/devices/X:X:X:X/" and value is in seconds.
> Once scsi command retry time is longer than this timeout,
> the command is treated as failure. 'retry_timeout' is set to '0' by default
> which means no timeout set.

Don't do this ... you're mixing a feature (which you'd need to justify)
with an apparent bug fix.

Once you dump all the complexity, I think the patch boils down to a
simple check before the action switch in scsi_io_completion():

	if (action !=  ACTION_FAIL &&
	    time_before(cmd->jiffies_at_alloc + wait_for, jiffies)) {
		action = ACTION_FAIL;
		description = "command timed out";
	}


James



^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH v2] scsi: Add 'retry_timeout' to avoid infinite command retry
  2014-02-07  5:46         ` James Bottomley
@ 2014-02-07  6:15           ` Libo Chen
  2014-02-11  1:33             ` Eiichi Tsukata
  0 siblings, 1 reply; 7+ messages in thread
From: Libo Chen @ 2014-02-07  6:15 UTC (permalink / raw)
  To: James Bottomley, Eiichi Tsukata
  Cc: linux-scsi, linux-kernel, yrl.pp-manager.tt

On 2014/2/7 13:46, James Bottomley wrote:
> On Fri, 2014-02-07 at 09:22 +0900, Eiichi Tsukata wrote:
>> Currently, scsi error handling in scsi_io_completion() tries to
>> unconditionally requeue scsi command when device keeps some error state.
>> For example, UNIT_ATTENTION causes infinite retry with
>> action == ACTION_RETRY.
>> This is because retryable errors are thought to be temporary and the scsi
>> device will soon recover from those errors. Normally, such retry policy is
>> appropriate because the device will soon recover from temporary error state.
> 
> 
> 
>> But there is no guarantee that device is able to recover from error state
>> immediately. Actually, we've experienced an infinite retry on some hardware.
>> Therefore hardware error can results in infinite command retry loop.
> 
> Could you please add an analysis of the actual failure; which devices
> and what conditions.
> 


same question, can you explain?

>> This patch adds 'retry_timeout' sysfs attribute which limits the retry time
>> of each scsi command. This attribute is located in scsi sysfs directory
>> for example "/sys/bus/scsi/devices/X:X:X:X/" and value is in seconds.
>> Once scsi command retry time is longer than this timeout,
>> the command is treated as failure. 'retry_timeout' is set to '0' by default
>> which means no timeout set.
> 
> Don't do this ... you're mixing a feature (which you'd need to justify)
> with an apparent bug fix.
> 
> Once you dump all the complexity, I think the patch boils down to a
> simple check before the action switch in scsi_io_completion():
> 
> 	if (action !=  ACTION_FAIL &&
> 	    time_before(cmd->jiffies_at_alloc + wait_for, jiffies)) {
> 		action = ACTION_FAIL;
> 		description = "command timed out";
> 	}
> 
> 
> James
> 
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/
> 
> 



^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: Re: [PATCH v2] scsi: Add 'retry_timeout' to avoid infinite command retry
  2014-02-07  6:15           ` Libo Chen
@ 2014-02-11  1:33             ` Eiichi Tsukata
  0 siblings, 0 replies; 7+ messages in thread
From: Eiichi Tsukata @ 2014-02-11  1:33 UTC (permalink / raw)
  To: Libo Chen, James Bottomley; +Cc: linux-scsi, linux-kernel, yrl.pp-manager.tt


(2014/02/07 15:15), Libo Chen wrote:
> On 2014/2/7 13:46, James Bottomley wrote:
>> On Fri, 2014-02-07 at 09:22 +0900, Eiichi Tsukata wrote:
>>> Currently, scsi error handling in scsi_io_completion() tries to
>>> unconditionally requeue scsi command when device keeps some error state.
>>> For example, UNIT_ATTENTION causes infinite retry with
>>> action == ACTION_RETRY.
>>> This is because retryable errors are thought to be temporary and the scsi
>>> device will soon recover from those errors. Normally, such retry policy is
>>> appropriate because the device will soon recover from temporary error state.
>>
>>
>>> But there is no guarantee that device is able to recover from error state
>>> immediately. Actually, we've experienced an infinite retry on some hardware.
>>> Therefore hardware error can results in infinite command retry loop.
>> Could you please add an analysis of the actual failure; which devices
>> and what conditions.
>>
>
> same question, can you explain?


I'm afraid to say that I can't expose the device name because I've not 
confirmed yet that
the device is responsible for the problem with the device manufacturer. 
However, with the
limited evidence, It seems that SCSI command retried with UNIT_ATTENTION.

In the previous thread, Ewan reported that a storage array returned a 
CHECK_CONDITION
with invalid sense data, which caused the command to be retried 
indefinitely:
https://lkml.org/lkml/2013/8/20/498

So, we should try to avoid infinite retry in SCSI middle layer, not in 
each SCSI LLDD.

>>> This patch adds 'retry_timeout' sysfs attribute which limits the retry time
>>> of each scsi command. This attribute is located in scsi sysfs directory
>>> for example "/sys/bus/scsi/devices/X:X:X:X/" and value is in seconds.
>>> Once scsi command retry time is longer than this timeout,
>>> the command is treated as failure. 'retry_timeout' is set to '0' by default
>>> which means no timeout set.
>> Don't do this ... you're mixing a feature (which you'd need to justify)
>> with an apparent bug fix.
>>
>> Once you dump all the complexity, I think the patch boils down to a
>> simple check before the action switch in scsi_io_completion():
>>
>> 	if (action !=  ACTION_FAIL &&
>> 	    time_before(cmd->jiffies_at_alloc + wait_for, jiffies)) {
>> 		action = ACTION_FAIL;
>> 		description = "command timed out";
>> 	}
>>

Sounds good!
Thanks for much simpler code. It's enough to fix the bug.
I'll resend the patch soon with the above code.

Eiichi.

>> James
>>
>>
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>> Please read the FAQ at  http://www.tux.org/lkml/
>>
>>
>
> .
>


^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2014-02-11  1:33 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
     [not found] <1391579254-26204-1-git-send-email-eiichi.tsukata.xh@hitachi.com>
2014-02-05  5:47 ` [REVIEW PATCH] scsi: Add 'retry_timeout' to avoid infinite command retry Eiichi Tsukata
2014-02-05 16:55   ` James Bottomley
2014-02-06  4:11     ` Eiichi Tsukata
2014-02-07  0:22       ` [PATCH v2] " Eiichi Tsukata
2014-02-07  5:46         ` James Bottomley
2014-02-07  6:15           ` Libo Chen
2014-02-11  1:33             ` Eiichi Tsukata

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).