Re: [PATCH v5 08/15] scsi: ufs: make error handling bit faster

linux-arm-msm.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

From: Hannes Reinecke <hare@suse.de>
To: ygardi@codeaurora.org
Cc: james.bottomley@hansenpartnership.com,
	linux-kernel@vger.kernel.org, linux-scsi@vger.kernel.org,
	linux-arm-msm@vger.kernel.org, santoshsy@gmail.com,
	linux-scsi-owner@vger.kernel.org,
	Subhash Jadavani <subhashj@codeaurora.org>,
	Vinayak Holikatti <vinholikatti@gmail.com>,
	"James E.J. Bottomley" <jbottomley@odin.com>,
	"Martin K. Petersen" <martin.petersen@oracle.com>
Subject: Re: [PATCH v5 08/15] scsi: ufs: make error handling bit faster
Date: Tue, 1 Mar 2016 18:02:54 +0800	[thread overview]
Message-ID: <56D568CE.9020104@suse.de> (raw)
In-Reply-To: <187f423aa64d09d32c6c136e28c51fc2.squirrel@us.codeaurora.org>

On 03/01/2016 05:56 PM, ygardi@codeaurora.org wrote:
>> On 02/28/2016 09:32 PM, Yaniv Gardi wrote:
>>> UFS driver's error handler forcefully tries to clear all the pending
>>> requests. For each pending request in the queue, it waits 1 sec for it
>>> to get cleared. If we have multiple requests in the queue then it's
>>> possible that we might end up waiting for those many seconds before
>>> resetting the host. But note that resetting host would any way clear
>>> all the pending requests from the hardware. Hence this change skips
>>> the forceful clear of the pending requests if we are anyway going to
>>> reset the host (for fatal errors).
>>>
>>> Signed-off-by: Subhash Jadavani <subhashj@codeaurora.org>
>>> Signed-off-by: Yaniv Gardi <ygardi@codeaurora.org>
>>>
>>> ---
>>>  drivers/scsi/ufs/ufshcd.c | 155
>>> +++++++++++++++++++++++++++++++++-------------
>>>  1 file changed, 112 insertions(+), 43 deletions(-)
>>>
>>> diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c
>>> index 987cf27..dc096f1 100644
>>> --- a/drivers/scsi/ufs/ufshcd.c
>>> +++ b/drivers/scsi/ufs/ufshcd.c
>>> @@ -133,9 +133,11 @@ enum {
>>>  /* UFSHCD UIC layer error flags */
>>>  enum {
>>>  	UFSHCD_UIC_DL_PA_INIT_ERROR = (1 << 0), /* Data link layer error */
>>> -	UFSHCD_UIC_NL_ERROR = (1 << 1), /* Network layer error */
>>> -	UFSHCD_UIC_TL_ERROR = (1 << 2), /* Transport Layer error */
>>> -	UFSHCD_UIC_DME_ERROR = (1 << 3), /* DME error */
>>> +	UFSHCD_UIC_DL_NAC_RECEIVED_ERROR = (1 << 1), /* Data link layer error
>>> */
>>> +	UFSHCD_UIC_DL_TCx_REPLAY_ERROR = (1 << 2), /* Data link layer error */
>>> +	UFSHCD_UIC_NL_ERROR = (1 << 3), /* Network layer error */
>>> +	UFSHCD_UIC_TL_ERROR = (1 << 4), /* Transport Layer error */
>>> +	UFSHCD_UIC_DME_ERROR = (1 << 5), /* DME error */
>>>  };
>>>
>>>  /* Interrupt configuration options */
>>> @@ -3465,31 +3467,18 @@ static void ufshcd_uic_cmd_compl(struct ufs_hba
>>> *hba, u32 intr_status)
>>>  }
>>>
>>>  /**
>>> - * ufshcd_transfer_req_compl - handle SCSI and query command completion
>>> + * __ufshcd_transfer_req_compl - handle SCSI and query command
>>> completion
>>>   * @hba: per adapter instance
>>> + * @completed_reqs: requests to complete
>>>   */
>>> -static void ufshcd_transfer_req_compl(struct ufs_hba *hba)
>>> +static void __ufshcd_transfer_req_compl(struct ufs_hba *hba,
>>> +					unsigned long completed_reqs)
>>>  {
>>>  	struct ufshcd_lrb *lrbp;
>>>  	struct scsi_cmnd *cmd;
>>> -	unsigned long completed_reqs;
>>> -	u32 tr_doorbell;
>>>  	int result;
>>>  	int index;
>>>
>>> -	/* Resetting interrupt aggregation counters first and reading the
>>> -	 * DOOR_BELL afterward allows us to handle all the completed requests.
>>> -	 * In order to prevent other interrupts starvation the DB is read once
>>> -	 * after reset. The down side of this solution is the possibility of
>>> -	 * false interrupt if device completes another request after resetting
>>> -	 * aggregation and before reading the DB.
>>> -	 */
>>> -	if (ufshcd_is_intr_aggr_allowed(hba))
>>> -		ufshcd_reset_intr_aggr(hba);
>>> -
>>> -	tr_doorbell = ufshcd_readl(hba, REG_UTP_TRANSFER_REQ_DOOR_BELL);
>>> -	completed_reqs = tr_doorbell ^ hba->outstanding_reqs;
>>> -
>>>  	for_each_set_bit(index, &completed_reqs, hba->nutrs) {
>>>  		lrbp = &hba->lrb[index];
>>>  		cmd = lrbp->cmd;
>>> @@ -3519,6 +3508,31 @@ static void ufshcd_transfer_req_compl(struct
>>> ufs_hba *hba)
>>>  }
>>>
>>>  /**
>>> + * ufshcd_transfer_req_compl - handle SCSI and query command completion
>>> + * @hba: per adapter instance
>>> + */
>>> +static void ufshcd_transfer_req_compl(struct ufs_hba *hba)
>>> +{
>>> +	unsigned long completed_reqs;
>>> +	u32 tr_doorbell;
>>> +
>>> +	/* Resetting interrupt aggregation counters first and reading the
>>> +	 * DOOR_BELL afterward allows us to handle all the completed requests.
>>> +	 * In order to prevent other interrupts starvation the DB is read once
>>> +	 * after reset. The down side of this solution is the possibility of
>>> +	 * false interrupt if device completes another request after resetting
>>> +	 * aggregation and before reading the DB.
>>> +	 */
>>> +	if (ufshcd_is_intr_aggr_allowed(hba))
>>> +		ufshcd_reset_intr_aggr(hba);
>>> +
>>> +	tr_doorbell = ufshcd_readl(hba, REG_UTP_TRANSFER_REQ_DOOR_BELL);
>>> +	completed_reqs = tr_doorbell ^ hba->outstanding_reqs;
>>> +
>>> +	__ufshcd_transfer_req_compl(hba, completed_reqs);
>>> +}
>>> +
>>> +/**
>>>   * ufshcd_disable_ee - disable exception event
>>>   * @hba: per-adapter instance
>>>   * @mask: exception event to disable
>>> @@ -3773,6 +3787,13 @@ out:
>>>  	return;
>>>  }
>>>
>>> +/* Complete requests that have door-bell cleared */
>>> +static void ufshcd_complete_requests(struct ufs_hba *hba)
>>> +{
>>> +	ufshcd_transfer_req_compl(hba);
>>> +	ufshcd_tmc_handler(hba);
>>> +}
>>> +
>>>  /**
>>>   * ufshcd_err_handler - handle UFS errors that require s/w attention
>>>   * @work: pointer to work structure
>>> @@ -3785,6 +3806,7 @@ static void ufshcd_err_handler(struct work_struct
>>> *work)
>>>  	u32 err_tm = 0;
>>>  	int err = 0;
>>>  	int tag;
>>> +	bool needs_reset = false;
>>>
>>>  	hba = container_of(work, struct ufs_hba, eh_work);
>>>
>>> @@ -3792,40 +3814,75 @@ static void ufshcd_err_handler(struct
>>> work_struct *work)
>>>  	ufshcd_hold(hba, false);
>>>
>>>  	spin_lock_irqsave(hba->host->host_lock, flags);
>>> -	if (hba->ufshcd_state == UFSHCD_STATE_RESET) {
>>> -		spin_unlock_irqrestore(hba->host->host_lock, flags);
>>> +	if (hba->ufshcd_state == UFSHCD_STATE_RESET)
>>>  		goto out;
>>> -	}
>>>
>>>  	hba->ufshcd_state = UFSHCD_STATE_RESET;
>>>  	ufshcd_set_eh_in_progress(hba);
>>>
>>>  	/* Complete requests that have door-bell cleared by h/w */
>>> -	ufshcd_transfer_req_compl(hba);
>>> -	ufshcd_tmc_handler(hba);
>>> -	spin_unlock_irqrestore(hba->host->host_lock, flags);
>>> +	ufshcd_complete_requests(hba);
>>> +	if ((hba->saved_err & INT_FATAL_ERRORS) ||
>>> +	    ((hba->saved_err & UIC_ERROR) &&
>>> +	    (hba->saved_uic_err & (UFSHCD_UIC_DL_PA_INIT_ERROR |
>>> +				   UFSHCD_UIC_DL_NAC_RECEIVED_ERROR |
>>> +				   UFSHCD_UIC_DL_TCx_REPLAY_ERROR))))
>>> +		needs_reset = true;
>>>
>>> +	/*
>>> +	 * if host reset is required then skip clearing the pending
>>> +	 * transfers forcefully because they will automatically get
>>> +	 * cleared after link startup.
>>> +	 */
>>> +	if (needs_reset)
>>> +		goto skip_pending_xfer_clear;
>>> +
>>> +	/* release lock as clear command might sleep */
>>> +	spin_unlock_irqrestore(hba->host->host_lock, flags);
>>>  	/* Clear pending transfer requests */
>>> -	for_each_set_bit(tag, &hba->outstanding_reqs, hba->nutrs)
>>> -		if (ufshcd_clear_cmd(hba, tag))
>>> -			err_xfer |= 1 << tag;
>>> +	for_each_set_bit(tag, &hba->outstanding_reqs, hba->nutrs) {
>>> +		if (ufshcd_clear_cmd(hba, tag)) {
>>> +			err_xfer = true;
>>> +			goto lock_skip_pending_xfer_clear;
>>> +		}
>>> +	}
>>>
>>>  	/* Clear pending task management requests */
>>> -	for_each_set_bit(tag, &hba->outstanding_tasks, hba->nutmrs)
>>> -		if (ufshcd_clear_tm_cmd(hba, tag))
>>> -			err_tm |= 1 << tag;
>>> +	for_each_set_bit(tag, &hba->outstanding_tasks, hba->nutmrs) {
>>> +		if (ufshcd_clear_tm_cmd(hba, tag)) {
>>> +			err_tm = true;
>>> +			goto lock_skip_pending_xfer_clear;
>>> +		}
>>> +	}
>>>
>>> -	/* Complete the requests that are cleared by s/w */
>>> +lock_skip_pending_xfer_clear:
>>>  	spin_lock_irqsave(hba->host->host_lock, flags);
>>> -	ufshcd_transfer_req_compl(hba);
>>> -	ufshcd_tmc_handler(hba);
>>> -	spin_unlock_irqrestore(hba->host->host_lock, flags);
>>>
>>> +	/* Complete the requests that are cleared by s/w */
>>> +	ufshcd_complete_requests(hba);
>>> +
>>> +	if (err_xfer || err_tm)
>>> +		needs_reset = true;
>>> +
>>> +skip_pending_xfer_clear:
>>>  	/* Fatal errors need reset */
>>> -	if (err_xfer || err_tm || (hba->saved_err & INT_FATAL_ERRORS) ||
>>> -			((hba->saved_err & UIC_ERROR) &&
>>> -			 (hba->saved_uic_err & UFSHCD_UIC_DL_PA_INIT_ERROR))) {
>>> +	if (needs_reset) {
>>> +		unsigned long max_doorbells = (1UL << hba->nutrs) - 1;
>>> +
>>> +		/*
>>> +		 * ufshcd_reset_and_restore() does the link reinitialization
>>> +		 * which will need atleast one empty doorbell slot to send the
>>> +		 * device management commands (NOP and query commands).
>>> +		 * If there is no slot empty at this moment then free up last
>>> +		 * slot forcefully.
>>> +		 */
>>> +		if (hba->outstanding_reqs == max_doorbells)
>>> +			__ufshcd_transfer_req_compl(hba,
>>> +						    (1UL << (hba->nutrs - 1)));
>>> +
>>> +		spin_unlock_irqrestore(hba->host->host_lock, flags);
>>>  		err = ufshcd_reset_and_restore(hba);
>>> +		spin_lock_irqsave(hba->host->host_lock, flags);
>>>  		if (err) {
>>>  			dev_err(hba->dev, "%s: reset and restore failed\n",
>>>  					__func__);
>> Why don't you reserve a command slot for this case (ie reduce the number
>> of tags by one)?
>> That way you would always have at least one slot free, wouldn't you?
>>
> 
> Hello Hannes,
> 
> We are discussing here, a very-very rare scenario where 2 conditions must
> co-exist:
> 1. a fatal error that requires the controller to be reset.
> 2. all slots are taken.
> 
> This 2 conditions, very rarely should happen together.
> At that point, it would be better to free the last slot, than to save one
> slot for this scenario.
> Also, we should remember that reducing the queue-depth for the entire
> usual operation, might be a performance hit at some point, where, for
> example, a LUN has only 8 slots, will now have 7, which is a hit of 12.5%
> of the potential parallelism. So, i would recommend to stick with the
> above proposal that not only makes more sense when you estimate the
> probability of the conditions to co-exist, but also was tested and proven
> to be safe with no performance hit.
> 
Okay, good point. I didn't know how many slots there are.
But if you only got 8 you surely don't want to waste one.

Reviewed-by: Hannes Reinecke <hare@suse.de>

Cheers,

Hannes
-- 
Dr. Hannes Reinecke		      zSeries & Storage
hare@suse.de			      +49 911 74053 688
SUSE LINUX Products GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: J. Hawn, J. Guild, F. Imendörffer, HRB 16746 (AG Nürnberg)

next prev parent reply	other threads:[~2016-03-01 10:03 UTC|newest]

Thread overview: 47+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2016-02-28 13:32 [PATCH v5 00/15] add fixes, device quirks, error recovery, Yaniv Gardi
2016-02-28 13:32 ` [PATCH v5 01/15] scsi: ufs-qcom: add number of lanes per direction Yaniv Gardi
     [not found]   ` <1456666367-11418-2-git-send-email-ygardi-sgV2jX0FEOL9JmXXK+q4OQ@public.gmane.org>
2016-03-01  5:08     ` Hannes Reinecke
2016-03-03 22:18   ` Rob Herring
2016-02-28 13:32 ` [PATCH v5 02/15] scsi: ufs: avoid spurious UFS host controller interrupts Yaniv Gardi
2016-03-01  5:10   ` Hannes Reinecke
2016-02-28 13:32 ` [PATCH v5 03/15] scsi: ufs: implement scsi host timeout handler Yaniv Gardi
2016-03-01  7:29   ` Hannes Reinecke
2016-03-01 13:25     ` ygardi
2016-03-03  7:22       ` Hannes Reinecke
2016-03-03  9:10         ` ygardi
2016-03-03 12:53           ` Hannes Reinecke
2016-03-06 10:33             ` ygardi
2016-03-08 11:48               ` ygardi
2016-03-08 11:48               ` ygardi
2016-03-08 12:26                 ` Dolev Raviv
2016-02-28 13:32 ` [PATCH v5 04/15] scsi: ufs: verify hba controller hce reg value Yaniv Gardi
2016-03-01  7:32   ` Hannes Reinecke
2016-03-01 13:32     ` ygardi
2016-03-03  7:24       ` Hannes Reinecke
2016-02-28 13:32 ` [PATCH v5 05/15] scsi: ufs: add support to read device and string descriptors Yaniv Gardi
2016-03-01  7:35   ` Hannes Reinecke
2016-03-01 10:01     ` ygardi
2016-03-01 10:03       ` Hannes Reinecke
2016-02-28 13:32 ` [PATCH v5 06/15] scsi: ufs: separate device and host quirks Yaniv Gardi
2016-03-01  7:38   ` Hannes Reinecke
2016-02-28 13:32 ` [PATCH v5 07/15] scsi: ufs: disable vccq if it's not needed by UFS device Yaniv Gardi
2016-03-01  7:36   ` Hannes Reinecke
2016-02-28 13:32 ` [PATCH v5 08/15] scsi: ufs: make error handling bit faster Yaniv Gardi
2016-03-01  7:50   ` Hannes Reinecke
2016-03-01  9:56     ` ygardi
2016-03-01 10:02       ` Hannes Reinecke [this message]
2016-02-28 13:32 ` [PATCH v5 09/15] scsi: ufs: add error recovery after DL NAC error Yaniv Gardi
2016-03-01  7:51   ` Hannes Reinecke
2016-02-28 13:32 ` [PATCH v5 10/15] scsi: ufs: add retry for query descriptors Yaniv Gardi
2016-03-01  7:53   ` Hannes Reinecke
2016-02-28 13:32 ` [PATCH v5 11/15] scsi: ufs: handle non spec compliant bkops behaviour by device Yaniv Gardi
2016-03-01  7:54   ` Hannes Reinecke
2016-02-28 13:32 ` [PATCH v5 12/15] scsi: ufs: tune UniPro parameters to optimize hibern8 exit time Yaniv Gardi
2016-03-01  7:55   ` Hannes Reinecke
2016-02-28 13:32 ` [PATCH v5 13/15] scsi: ufs: fix leakage during link off state Yaniv Gardi
2016-03-01  7:56   ` Hannes Reinecke
2016-02-28 13:32 ` [PATCH v5 14/15] scsi: ufs: add device quirk delay before putting UFS rails in LPM Yaniv Gardi
2016-03-01  7:57   ` Hannes Reinecke
2016-02-28 13:32 ` [PATCH v5 15/15] scsi: ufs-qcom: set PA_Local_TX_LCC_Enable before link startup Yaniv Gardi
2016-03-01  7:58   ` Hannes Reinecke
2016-03-06 11:57     ` ygardi

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=56D568CE.9020104@suse.de \
    --to=hare@suse.de \
    --cc=james.bottomley@hansenpartnership.com \
    --cc=jbottomley@odin.com \
    --cc=linux-arm-msm@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-scsi-owner@vger.kernel.org \
    --cc=linux-scsi@vger.kernel.org \
    --cc=martin.petersen@oracle.com \
    --cc=santoshsy@gmail.com \
    --cc=subhashj@codeaurora.org \
    --cc=vinholikatti@gmail.com \
    --cc=ygardi@codeaurora.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).