linux-scsi.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: ygardi@codeaurora.org
To: Hannes Reinecke <hare@suse.de>
Cc: Yaniv Gardi <ygardi@codeaurora.org>,
	james.bottomley@hansenpartnership.com,
	linux-kernel@vger.kernel.org, linux-scsi@vger.kernel.org,
	linux-arm-msm@vger.kernel.org, santoshsy@gmail.com,
	linux-scsi-owner@vger.kernel.org,
	Subhash Jadavani <subhashj@codeaurora.org>,
	Vinayak Holikatti <vinholikatti@gmail.com>,
	"James E.J. Bottomley" <jbottomley@odin.com>,
	"Martin K. Petersen" <martin.petersen@oracle.com>
Subject: Re: [PATCH v5 08/15] scsi: ufs: make error handling bit faster
Date: Tue, 1 Mar 2016 09:56:40 -0000	[thread overview]
Message-ID: <187f423aa64d09d32c6c136e28c51fc2.squirrel@us.codeaurora.org> (raw)
In-Reply-To: <56D549CD.70104@suse.de>

> On 02/28/2016 09:32 PM, Yaniv Gardi wrote:
>> UFS driver's error handler forcefully tries to clear all the pending
>> requests. For each pending request in the queue, it waits 1 sec for it
>> to get cleared. If we have multiple requests in the queue then it's
>> possible that we might end up waiting for those many seconds before
>> resetting the host. But note that resetting host would any way clear
>> all the pending requests from the hardware. Hence this change skips
>> the forceful clear of the pending requests if we are anyway going to
>> reset the host (for fatal errors).
>>
>> Signed-off-by: Subhash Jadavani <subhashj@codeaurora.org>
>> Signed-off-by: Yaniv Gardi <ygardi@codeaurora.org>
>>
>> ---
>>  drivers/scsi/ufs/ufshcd.c | 155
>> +++++++++++++++++++++++++++++++++-------------
>>  1 file changed, 112 insertions(+), 43 deletions(-)
>>
>> diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c
>> index 987cf27..dc096f1 100644
>> --- a/drivers/scsi/ufs/ufshcd.c
>> +++ b/drivers/scsi/ufs/ufshcd.c
>> @@ -133,9 +133,11 @@ enum {
>>  /* UFSHCD UIC layer error flags */
>>  enum {
>>  	UFSHCD_UIC_DL_PA_INIT_ERROR = (1 << 0), /* Data link layer error */
>> -	UFSHCD_UIC_NL_ERROR = (1 << 1), /* Network layer error */
>> -	UFSHCD_UIC_TL_ERROR = (1 << 2), /* Transport Layer error */
>> -	UFSHCD_UIC_DME_ERROR = (1 << 3), /* DME error */
>> +	UFSHCD_UIC_DL_NAC_RECEIVED_ERROR = (1 << 1), /* Data link layer error
>> */
>> +	UFSHCD_UIC_DL_TCx_REPLAY_ERROR = (1 << 2), /* Data link layer error */
>> +	UFSHCD_UIC_NL_ERROR = (1 << 3), /* Network layer error */
>> +	UFSHCD_UIC_TL_ERROR = (1 << 4), /* Transport Layer error */
>> +	UFSHCD_UIC_DME_ERROR = (1 << 5), /* DME error */
>>  };
>>
>>  /* Interrupt configuration options */
>> @@ -3465,31 +3467,18 @@ static void ufshcd_uic_cmd_compl(struct ufs_hba
>> *hba, u32 intr_status)
>>  }
>>
>>  /**
>> - * ufshcd_transfer_req_compl - handle SCSI and query command completion
>> + * __ufshcd_transfer_req_compl - handle SCSI and query command
>> completion
>>   * @hba: per adapter instance
>> + * @completed_reqs: requests to complete
>>   */
>> -static void ufshcd_transfer_req_compl(struct ufs_hba *hba)
>> +static void __ufshcd_transfer_req_compl(struct ufs_hba *hba,
>> +					unsigned long completed_reqs)
>>  {
>>  	struct ufshcd_lrb *lrbp;
>>  	struct scsi_cmnd *cmd;
>> -	unsigned long completed_reqs;
>> -	u32 tr_doorbell;
>>  	int result;
>>  	int index;
>>
>> -	/* Resetting interrupt aggregation counters first and reading the
>> -	 * DOOR_BELL afterward allows us to handle all the completed requests.
>> -	 * In order to prevent other interrupts starvation the DB is read once
>> -	 * after reset. The down side of this solution is the possibility of
>> -	 * false interrupt if device completes another request after resetting
>> -	 * aggregation and before reading the DB.
>> -	 */
>> -	if (ufshcd_is_intr_aggr_allowed(hba))
>> -		ufshcd_reset_intr_aggr(hba);
>> -
>> -	tr_doorbell = ufshcd_readl(hba, REG_UTP_TRANSFER_REQ_DOOR_BELL);
>> -	completed_reqs = tr_doorbell ^ hba->outstanding_reqs;
>> -
>>  	for_each_set_bit(index, &completed_reqs, hba->nutrs) {
>>  		lrbp = &hba->lrb[index];
>>  		cmd = lrbp->cmd;
>> @@ -3519,6 +3508,31 @@ static void ufshcd_transfer_req_compl(struct
>> ufs_hba *hba)
>>  }
>>
>>  /**
>> + * ufshcd_transfer_req_compl - handle SCSI and query command completion
>> + * @hba: per adapter instance
>> + */
>> +static void ufshcd_transfer_req_compl(struct ufs_hba *hba)
>> +{
>> +	unsigned long completed_reqs;
>> +	u32 tr_doorbell;
>> +
>> +	/* Resetting interrupt aggregation counters first and reading the
>> +	 * DOOR_BELL afterward allows us to handle all the completed requests.
>> +	 * In order to prevent other interrupts starvation the DB is read once
>> +	 * after reset. The down side of this solution is the possibility of
>> +	 * false interrupt if device completes another request after resetting
>> +	 * aggregation and before reading the DB.
>> +	 */
>> +	if (ufshcd_is_intr_aggr_allowed(hba))
>> +		ufshcd_reset_intr_aggr(hba);
>> +
>> +	tr_doorbell = ufshcd_readl(hba, REG_UTP_TRANSFER_REQ_DOOR_BELL);
>> +	completed_reqs = tr_doorbell ^ hba->outstanding_reqs;
>> +
>> +	__ufshcd_transfer_req_compl(hba, completed_reqs);
>> +}
>> +
>> +/**
>>   * ufshcd_disable_ee - disable exception event
>>   * @hba: per-adapter instance
>>   * @mask: exception event to disable
>> @@ -3773,6 +3787,13 @@ out:
>>  	return;
>>  }
>>
>> +/* Complete requests that have door-bell cleared */
>> +static void ufshcd_complete_requests(struct ufs_hba *hba)
>> +{
>> +	ufshcd_transfer_req_compl(hba);
>> +	ufshcd_tmc_handler(hba);
>> +}
>> +
>>  /**
>>   * ufshcd_err_handler - handle UFS errors that require s/w attention
>>   * @work: pointer to work structure
>> @@ -3785,6 +3806,7 @@ static void ufshcd_err_handler(struct work_struct
>> *work)
>>  	u32 err_tm = 0;
>>  	int err = 0;
>>  	int tag;
>> +	bool needs_reset = false;
>>
>>  	hba = container_of(work, struct ufs_hba, eh_work);
>>
>> @@ -3792,40 +3814,75 @@ static void ufshcd_err_handler(struct
>> work_struct *work)
>>  	ufshcd_hold(hba, false);
>>
>>  	spin_lock_irqsave(hba->host->host_lock, flags);
>> -	if (hba->ufshcd_state == UFSHCD_STATE_RESET) {
>> -		spin_unlock_irqrestore(hba->host->host_lock, flags);
>> +	if (hba->ufshcd_state == UFSHCD_STATE_RESET)
>>  		goto out;
>> -	}
>>
>>  	hba->ufshcd_state = UFSHCD_STATE_RESET;
>>  	ufshcd_set_eh_in_progress(hba);
>>
>>  	/* Complete requests that have door-bell cleared by h/w */
>> -	ufshcd_transfer_req_compl(hba);
>> -	ufshcd_tmc_handler(hba);
>> -	spin_unlock_irqrestore(hba->host->host_lock, flags);
>> +	ufshcd_complete_requests(hba);
>> +	if ((hba->saved_err & INT_FATAL_ERRORS) ||
>> +	    ((hba->saved_err & UIC_ERROR) &&
>> +	    (hba->saved_uic_err & (UFSHCD_UIC_DL_PA_INIT_ERROR |
>> +				   UFSHCD_UIC_DL_NAC_RECEIVED_ERROR |
>> +				   UFSHCD_UIC_DL_TCx_REPLAY_ERROR))))
>> +		needs_reset = true;
>>
>> +	/*
>> +	 * if host reset is required then skip clearing the pending
>> +	 * transfers forcefully because they will automatically get
>> +	 * cleared after link startup.
>> +	 */
>> +	if (needs_reset)
>> +		goto skip_pending_xfer_clear;
>> +
>> +	/* release lock as clear command might sleep */
>> +	spin_unlock_irqrestore(hba->host->host_lock, flags);
>>  	/* Clear pending transfer requests */
>> -	for_each_set_bit(tag, &hba->outstanding_reqs, hba->nutrs)
>> -		if (ufshcd_clear_cmd(hba, tag))
>> -			err_xfer |= 1 << tag;
>> +	for_each_set_bit(tag, &hba->outstanding_reqs, hba->nutrs) {
>> +		if (ufshcd_clear_cmd(hba, tag)) {
>> +			err_xfer = true;
>> +			goto lock_skip_pending_xfer_clear;
>> +		}
>> +	}
>>
>>  	/* Clear pending task management requests */
>> -	for_each_set_bit(tag, &hba->outstanding_tasks, hba->nutmrs)
>> -		if (ufshcd_clear_tm_cmd(hba, tag))
>> -			err_tm |= 1 << tag;
>> +	for_each_set_bit(tag, &hba->outstanding_tasks, hba->nutmrs) {
>> +		if (ufshcd_clear_tm_cmd(hba, tag)) {
>> +			err_tm = true;
>> +			goto lock_skip_pending_xfer_clear;
>> +		}
>> +	}
>>
>> -	/* Complete the requests that are cleared by s/w */
>> +lock_skip_pending_xfer_clear:
>>  	spin_lock_irqsave(hba->host->host_lock, flags);
>> -	ufshcd_transfer_req_compl(hba);
>> -	ufshcd_tmc_handler(hba);
>> -	spin_unlock_irqrestore(hba->host->host_lock, flags);
>>
>> +	/* Complete the requests that are cleared by s/w */
>> +	ufshcd_complete_requests(hba);
>> +
>> +	if (err_xfer || err_tm)
>> +		needs_reset = true;
>> +
>> +skip_pending_xfer_clear:
>>  	/* Fatal errors need reset */
>> -	if (err_xfer || err_tm || (hba->saved_err & INT_FATAL_ERRORS) ||
>> -			((hba->saved_err & UIC_ERROR) &&
>> -			 (hba->saved_uic_err & UFSHCD_UIC_DL_PA_INIT_ERROR))) {
>> +	if (needs_reset) {
>> +		unsigned long max_doorbells = (1UL << hba->nutrs) - 1;
>> +
>> +		/*
>> +		 * ufshcd_reset_and_restore() does the link reinitialization
>> +		 * which will need atleast one empty doorbell slot to send the
>> +		 * device management commands (NOP and query commands).
>> +		 * If there is no slot empty at this moment then free up last
>> +		 * slot forcefully.
>> +		 */
>> +		if (hba->outstanding_reqs == max_doorbells)
>> +			__ufshcd_transfer_req_compl(hba,
>> +						    (1UL << (hba->nutrs - 1)));
>> +
>> +		spin_unlock_irqrestore(hba->host->host_lock, flags);
>>  		err = ufshcd_reset_and_restore(hba);
>> +		spin_lock_irqsave(hba->host->host_lock, flags);
>>  		if (err) {
>>  			dev_err(hba->dev, "%s: reset and restore failed\n",
>>  					__func__);
> Why don't you reserve a command slot for this case (ie reduce the number
> of tags by one)?
> That way you would always have at least one slot free, wouldn't you?
>

Hello Hannes,

We are discussing a very, very rare scenario here, in which 2 conditions
must co-exist:
1. a fatal error that requires the controller to be reset.
2. all slots are taken.

These 2 conditions should very rarely happen together.
At that point, it would be better to free the last slot than to reserve one
slot for this scenario.
Also, we should remember that reducing the queue depth for all normal
operation might be a performance hit at some point. For example, a LUN
that has only 8 slots will now have 7, which is a hit of 12.5% of the
potential parallelism. So, I would recommend sticking with the above
proposal, which not only makes more sense when you estimate the
probability of the two conditions co-existing, but was also tested and
proven to be safe with no performance hit.

Regards,
Yaniv




> Cheers,
>
> Hannes
> --
> Dr. Hannes Reinecke		      zSeries & Storage
> hare@suse.de			      +49 911 74053 688
> SUSE LINUX Products GmbH, Maxfeldstr. 5, 90409 Nürnberg
> GF: J. Hawn, J. Guild, F. Imendörffer, HRB 16746 (AG Nürnberg)
> --
> To unsubscribe from this list: send the line "unsubscribe linux-scsi" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>

  reply	other threads:[~2016-03-01  9:56 UTC|newest]

Thread overview: 47+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2016-02-28 13:32 [PATCH v5 00/15] add fixes, device quirks, error recovery, Yaniv Gardi
2016-02-28 13:32 ` [PATCH v5 01/15] scsi: ufs-qcom: add number of lanes per direction Yaniv Gardi
     [not found]   ` <1456666367-11418-2-git-send-email-ygardi-sgV2jX0FEOL9JmXXK+q4OQ@public.gmane.org>
2016-03-01  5:08     ` Hannes Reinecke
2016-03-03 22:18   ` Rob Herring
2016-02-28 13:32 ` [PATCH v5 02/15] scsi: ufs: avoid spurious UFS host controller interrupts Yaniv Gardi
2016-03-01  5:10   ` Hannes Reinecke
2016-02-28 13:32 ` [PATCH v5 03/15] scsi: ufs: implement scsi host timeout handler Yaniv Gardi
2016-03-01  7:29   ` Hannes Reinecke
2016-03-01 13:25     ` ygardi
2016-03-03  7:22       ` Hannes Reinecke
2016-03-03  9:10         ` ygardi
2016-03-03 12:53           ` Hannes Reinecke
2016-03-06 10:33             ` ygardi
2016-03-08 11:48               ` ygardi
2016-03-08 11:48               ` ygardi
2016-03-08 12:26                 ` Dolev Raviv
2016-02-28 13:32 ` [PATCH v5 04/15] scsi: ufs: verify hba controller hce reg value Yaniv Gardi
2016-03-01  7:32   ` Hannes Reinecke
2016-03-01 13:32     ` ygardi
2016-03-03  7:24       ` Hannes Reinecke
2016-02-28 13:32 ` [PATCH v5 05/15] scsi: ufs: add support to read device and string descriptors Yaniv Gardi
2016-03-01  7:35   ` Hannes Reinecke
2016-03-01 10:01     ` ygardi
2016-03-01 10:03       ` Hannes Reinecke
2016-02-28 13:32 ` [PATCH v5 06/15] scsi: ufs: separate device and host quirks Yaniv Gardi
2016-03-01  7:38   ` Hannes Reinecke
2016-02-28 13:32 ` [PATCH v5 07/15] scsi: ufs: disable vccq if it's not needed by UFS device Yaniv Gardi
2016-03-01  7:36   ` Hannes Reinecke
2016-02-28 13:32 ` [PATCH v5 08/15] scsi: ufs: make error handling bit faster Yaniv Gardi
2016-03-01  7:50   ` Hannes Reinecke
2016-03-01  9:56     ` ygardi [this message]
2016-03-01 10:02       ` Hannes Reinecke
2016-02-28 13:32 ` [PATCH v5 09/15] scsi: ufs: add error recovery after DL NAC error Yaniv Gardi
2016-03-01  7:51   ` Hannes Reinecke
2016-02-28 13:32 ` [PATCH v5 10/15] scsi: ufs: add retry for query descriptors Yaniv Gardi
2016-03-01  7:53   ` Hannes Reinecke
2016-02-28 13:32 ` [PATCH v5 11/15] scsi: ufs: handle non spec compliant bkops behaviour by device Yaniv Gardi
2016-03-01  7:54   ` Hannes Reinecke
2016-02-28 13:32 ` [PATCH v5 12/15] scsi: ufs: tune UniPro parameters to optimize hibern8 exit time Yaniv Gardi
2016-03-01  7:55   ` Hannes Reinecke
2016-02-28 13:32 ` [PATCH v5 13/15] scsi: ufs: fix leakage during link off state Yaniv Gardi
2016-03-01  7:56   ` Hannes Reinecke
2016-02-28 13:32 ` [PATCH v5 14/15] scsi: ufs: add device quirk delay before putting UFS rails in LPM Yaniv Gardi
2016-03-01  7:57   ` Hannes Reinecke
2016-02-28 13:32 ` [PATCH v5 15/15] scsi: ufs-qcom: set PA_Local_TX_LCC_Enable before link startup Yaniv Gardi
2016-03-01  7:58   ` Hannes Reinecke
2016-03-06 11:57     ` ygardi

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=187f423aa64d09d32c6c136e28c51fc2.squirrel@us.codeaurora.org \
    --to=ygardi@codeaurora.org \
    --cc=hare@suse.de \
    --cc=james.bottomley@hansenpartnership.com \
    --cc=jbottomley@odin.com \
    --cc=linux-arm-msm@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-scsi-owner@vger.kernel.org \
    --cc=linux-scsi@vger.kernel.org \
    --cc=martin.petersen@oracle.com \
    --cc=santoshsy@gmail.com \
    --cc=subhashj@codeaurora.org \
    --cc=vinholikatti@gmail.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).